*code gets slers ready for analysis, gets 2018 cands ready for analysis, runs model, and does simulations on results.  

*The NY Senate Independent Democratic Conference alteration has been taken out of this run.  

*Small mistake in how super-majorities were calculated, see that section below.  

*SETUP
*Empty memory, pin the Stata version the code was written for, require full variable names, and move to the run directory.  
clear
version 15.1
*With varabbrev off, every variable referenced below has to be spelled out in full.  
set varabbrev off, permanently
*NOTE(review): absolute, machine-specific path; all file names below are relative to this directory.  
cd E:\Dropbox\08_FORECAST\005runs\run20181030

*Date stamp that is appended to the names of the files this run creates.  
global datezzz 20181030

*INPUT FILES
*put the name of the slers file used here
global slersfile 001_196slers1967to2016_20180908
*put name of wave file used here
global wavefile 003_008wave20181029
*Put the name of the file with the 2018 cands used here
global newcands 006_147filers20181030

*OUTPUT FILES
*put the name of the file with 2018 cases that is to be created here
global cases2018 102_2018cases$datezzz
*put name of main file that is to be created here
global mainfile 103mainfile$datezzz
*Put name of the drop-1 file that is to be created here.  
global drop1file 112drop1file$datezzz

*GOVPARTY/SMIDPEN
*Build tempgub: one row per state-year with the lagged governor's party (govparty) and the midterm-penalty term (smidpen).  
clear
import excel 011_govs20180510bshortened.xlsx, firstrow
*Eyeball check that the two party codings agree perfectly before one of them is discarded.  
tabulate govpartyb govpartyb2
drop govpartyb state sessyear
rename (electyear govpartyb2) (year govparty)
*governor's party is put in the session year after they win election right now, so govparty must be brought forward one year, in other words lagged.  
tsset stateno year
by stateno: generate lag = govparty[_n-1]
drop govparty
rename lag govparty
*smidpen is 0 in gubernatorial election years.  In other years it is the opposite sign of govparty (1 -> -1, -1 -> 1, 0 -> 0).  It stays missing when gubelection is neither 0 nor 1, or when govparty is not one of -1, 0, 1.  
generate smidpen = 0 if gubelection==1
replace smidpen = cond(govparty==1, -1, cond(govparty==-1, 1, 0)) if gubelection==0 & inlist(govparty, -1, 0, 1)
save tempgub, replace

*CHAMBER SEATS
*Get number of seats in each chamber
*Writes tempchamberseats with one row per sid/sen taken from the sessyear==2017 rows of the partisan balance file.  
clear
import excel 002_StatePartisanBalance1777to2016_20171027_SourceFiles.xlsx, firstrow
keep if sessyear==2017
*sen is 1 when chambercode is 8 and 0 for every other chambercode.  
gen sen=chambercode==8
rename stateno sid
keep sid sen totinsess
save tempchamberseats, replace

*2018 NC REDIST
*Not everything is needed from this file for the immediate purpose it's used for.  The other variables will be brought in later.  
*Writes tempncredist, keyed on sid sen dno, holding only the two redistricting variables that are merged onto the 2018 skeleton.  
clear
use 012_103estnclag20180819
keep sid sen dno redistbrian redist2brian
save tempncredist, replace

*2018 SKELETON
*This keeps all seats in the country currently, not just those up in 2018, with the exception of the odd-year states and non-partisan Nebraska.  Some of the state senate seats have sg elections, and it will be time consuming to get their nesting info later.  
*Result: $cases2018 with one row per seat group (sid sen dname dno geopost mmdpost), the number of seats held by each party, and the year the seat is next up.  
clear
use $slersfile
*Only determinative elections from 2014 on are used to establish who holds each seat.  
drop if deter==0
drop if year<2014|(sid==9&sen==1&year==2014)
*These state codes are removed from the skeleton entirely (see the header comment: odd-year states and Nebraska).  
drop if sid==18|sid==24|sid==30|sid==46|sid==27
*There are two etype=gs elections in FL in 2016.
replace termz=2 if year==2016&sid==9&sen==1&(dno==23|dno==25)
*only keep the last election for which a seat was up in.  
egen max=max(year), by(sid sen dname dno geopost mmdpost)
keep if year==max
rename year lastup
gen nextup=lastup+termz
*First collapse: one row per candidate (c just counts the rows being combined).  
gen c=1
collapse (sum) c, by(sid sen dname dno geopost mmdpost nest nest1 nest2 nest3 eseats dtype cand partyt outcome nextup)
*Flag winners by party; anything that is not d or r counts as "other".  
gen dheld=outcome=="w"&partyt=="d"
gen rheld=outcome=="w"&partyt=="r"
gen oheld=outcome=="w"&partyt!="d"&partyt!="r"
*Second collapse: one row per seat group with the number of seats each party holds.  
collapse (sum) dheld rheld oheld, by(sid sen dname dno geopost mmdpost nest nest1 nest2 nest3 eseats dtype nextup)
*Half-year terms that land on 2018.5 are treated as up in 2018.  
replace nextup=2018 if nextup==2018.5
*confirm the following vars identify
bysort sid sen dname dno geopost mmdpost: gen temp=_N
assert temp==1
drop temp
*confirm all seats accounted for
gen dif=eseats-(dheld+rheld+oheld)
assert dif==0
drop dif
*Bring in NC redistricting data
merge m:1 sid sen dno using tempncredist
erase tempncredist.dta
drop _merge
rename redistbrian redist
rename redist2brian redist2
gen year=2018
*Rows that received no value from the NC file get redist=0; redist2 is left as merged.  
replace redist=0 if redist==.
save $cases2018, replace

*CHECKSEATS
*confirm that every seat is accounted for.  
*Compares the seats summed from the 2018 skeleton with totinsess from tempchamberseats, chamber by chamber.  Nothing is saved; this section only asserts.  
clear
use $cases2018
collapse (sum) eseats, by(sid sen)
merge 1:1 sid sen using tempchamberseats
erase tempchamberseats.dta
*dif is missing when a chamber is present in only one of the two files; those rows are tolerated by the assert.  
gen dif=totinsess-eseats
assert dif==0|dif==.
*All 0, all chambers accounted for.
*browse
*everything looks cool
clear

*PROPUP
*Create file with the proportion of seats up in each chamber here.  
*Result: tempseatprop, keyed on year sid sen, with propup = seats up / total seats in the chamber.  
*Get total seats
clear
import excel 002_StatePartisanBalance1777to2016_20171027_SourceFiles.xlsx, firstrow
rename electyear year
drop if year<1967
gen sen=chambercode==8
rename stateno sid
keep year sid sen totinsess
save temp, replace
*Count the seats actually filled in each year from the determinative winners in SLERS.  
clear
use $slersfile
keep if deter==1&outcome=="w"
*Elections whose term length has a fractional part of .5 are assigned to the previous year.  
gen temp=mod(termz,1)
replace year=year-1 if temp==.5
*One row per winner, then one row per election with c = number of winners.  
collapse (mean) eseats, by(year sid sen dname dno geopost mmdpost cand)
gen c=1
collapse (mean) eseats (sum) c, by(year sid sen dname dno geopost mmdpost)
*Manual correction for this one election so that eseats matches the number of winners (the same election is edited in the RUNOFFS section further down).  
replace eseats=2 if sid==45&sen==0&year==1986&dname=="orleans"&dno==3
*The number of winners has to equal the number of seats in every election.  
gen dif=eseats-c
assert dif==0
drop c dif
collapse (sum) eseats, by(year sid sen)
merge 1:1 year sid sen using temp
gen propup=eseats/totinsess
assert propup<=1|propup==.
*seatprop never more than 1, good
drop eseats totinsess _merge
drop if propup==.
list if propup>.9&propup!=1
*that should be ak 2012 sen propup=.95, if it's anything else or more, there could potentially be a mistake.  
save tempseatprop, replace

*2018CANDS
*Load the 2018 filers and remove everyone who cannot appear in the general election.  
clear
use $newcands

*LOSERS & WRITEINS
*Get rid of people who lost in primary, or dropped out before the primary, or were writeins.  
*First, see if losers and writeins ever constitute every individual in an election.  That would be bad.  
*temp flags rows with outcome "d", primary-election rows with outcome "l", and rows with primaryoutcome "l".  
gen temp=outcome=="d"|(outcome=="l"&(etype=="dp"|etype=="rp"))|primaryoutcome=="l"
tab primaryoutcome if partyz=="writein"
*6 winners are partyz=writein.
tab sid if primaryoutcome=="w"&partyz=="writein"
*All in IL.  
list etype if primaryoutcome=="w"&partyz=="writein"
*these are all in dem or repub primaries.  
*max marks every row in a district that contains at least one write-in primary winner, so the whole district can be listed.  
gen temp2=primaryoutcome=="w"&partyz=="writein"
egen max=max(temp2), by(sid sen dname dno geopost mmdpost)
*votes is renamed to vote for the rest of the file.  
rename votes vote
sort dno etype last first vote
list dno etype last first vote partyz outcome primaryoutcome if max==1
*2 out of 6 shouldn't be winners.  The other four are running in uncont primaries.  Depending on the law RE writeins in primaries, they may or may not be winners.  
list dno etype last first vote partyz outcome primaryoutcome if sid==13&sen==0
list dno etype last first vote partyz outcome primaryoutcome if sid==13&sen==1
*get rid of the writeins, as you did before.  
drop temp2 max
replace temp=1 if partyz=="writein"
*min is 0 when at least one row in the district is not flagged for removal.  
egen min=min(temp), by(sid sen dname dno geopost mmdpost)
assert min==0
*min is always 0, good
drop temp min
*The second clause gets rid of people that ballotpedia 1) didn't list as a winner and 2) didn't list as someone who went to a runoff primary.  
tab outcome
*Given the possible values of outcome, the following will work.  
*drop is a dry run of the actual drop condition used below; the assert confirms no district would be emptied.  
gen drop=outcome=="d"|outcome=="l"|primaryoutcome=="l"|partyz=="writein"
egen min=min(drop), by(sid sen dname dno geopost mmdpost)
assert min==0
*awesome, there is at least one case not being dropped in each district.  
drop drop min
drop if outcome=="d"|outcome=="l"|primaryoutcome=="l"|partyz=="writein"

*UOA CAND
*Make the unit of analysis a candidate, not a candidate-party.  
*At the end of this section each sid sen dname dno geopost mmdpost last first combination has exactly one row (asserted on the last line).  
gen d=partyz=="d"
gen r=partyz=="r"
*NOTE(review): o is created here but is not in the drop list at the end of this section, so it remains in the data that is saved later.  
gen o=partyz=="o"
*Are there Dem-Repubs in NY and RI (those are the only places with multiple lines for one cand)?
*maxb is 1 for every row of a candidate who has both a d line and an r line.  
egen maxd=max(d), by(sid sen dname dno geopost mmdpost last first)
egen maxr=max(r), by(sid sen dname dno geopost mmdpost last first)
gen maxb=maxd&maxr
tab sid maxb
*There are only 7 cases of this.
list sid sen dname dno geopost mmdpost last first middle suffix partyz exper candid if maxb==1
*It's two people, and one is an incumbent.  Simcha Felder (the code below keys on last=="felder") was coded partyz=dem in slers in prior years.  However, he caucuses with the Republicans according to the following. http://thehill.com/homenews/state-watch/384745-dems-win-majority-in-new-york-senate-but-wont-control-it.  I'm making him a Republican.  
*Note that this recodes every one of his rows, whatever party line the row was for.  
replace partyz="r" if sid==32&sen==1&dno==17&last=="felder"
*the second candidate is ashley Zanatta, who wasn't matched with slers.  NY Assembly #62.  According to slers, there was no dem in that district in 2016.  There was one candidate, repub, conservative, independent and reform.  No dem in 2014 either, same story.  winner in 2014 didn't run in 2016 and winner in 2016 didn't run in 2018.  
*Ballotpedia has some new cands in that race I didn't know about.  So ashley is the repub, and she's running in the dem primary, but there are two other cands running in the dem primary.  So depending on how the dem primary goes, if we consider her partyz=repub, then the race might be major party uncontested, or contested.  Since this seat went repub in the last two elections (uncontested repub) I'm leaving this as it is and classifying her partyz as repub.  Ashley is said to be a republican in the following link, but she was also endorsed by the local dem party.  https://www.silive.com/news/2018/06/staten_island_dems_endorse_ash.html accessed August 7, 2018.  
replace partyz="r" if sid==32&sen==0&dno==62&last=="zanatta"
*Make the unit of analysis a candidate, not a candidate-party.  Non-maj parties are ignored if someone is either a dem or a repub.  
*the following sc case is also dealt with with the following: sc hs #106, last=="gause"
*maxd and maxr were computed before the two recodes above, so they still reflect the original party lines.  
drop if (maxd==1&partyz=="nonmaj")|(maxr==1&partyz=="nonmaj")
bysort sid sen dname dno geopost mmdpost last first: gen rows=_N
tab sid rows
*9 multirow cands left in NY.  Those are probably multirow non-maj party cands.  
list sen dno last first partyz if rows!=1
*2 of those are those, the other 7 are the two dem-repubs, which makes sense.  
*Keep one row per candidate.  NOTE(review): the by-list has no tie-breaking sort key, so which of a candidate's rows becomes row 1 is not pinned down by this code.  
bysort sid sen dname dno geopost mmdpost last first: gen row=_n
drop if rows!=1&row!=1
*6 cases dropped, perfect.  
drop d r maxd maxr maxb rows row
bysort sid sen dname dno geopost mmdpost last first: assert _N==1

*MERGE
*merge 2018 cands with 2018 skeleton
*Candidates are the master data and the skeleton is the using data, so _merge==3 is a candidate matched to a seat and _merge==2 is a seat with no candidate row.  
merge m:1 sid sen dname dno geopost mmdpost using $cases2018
*Every candidate must land in a known district.  
assert _merge!=1
*no merge=1, good, just as planned.
tab sid sen if _merge==2
*2 cases in ma didn't merge / are merge=2, not good
*2 cases in vt are same
*Those are the only obviously problematic merge=2 cases.  The other cases are either nd hs, or state senates where not everyone is up.
list sid sen dname dno geopost last first middle if _merge==2&(sid==21|sid==45)
*no name for those districts either.  
*I believe these are districts that had filers, but the filers weren't in the lists I obtained.  
*Ballotpedia says that there are no candidates of any kind in those four districts.
*For now, I'm arbitrarily going to say those districts will all be won by Dems.  I have to move on.  
replace partyz="d" if partyz==""&(sid==21|sid==45)
*4 changes, perfect
*The merge indicator is kept as merge1 because the APPEND section uses it to tell contested 2018 seats from seats without a candidate row.  
rename _merge merge1

*make the following consistent with what will be used from slers
rename partyz partyt

*This overwrites the skeleton file with the candidate-level 2018 file.  
save $cases2018, replace



*SLERs
*Load the historical returns; everything from here to the APPEND section operates on this file.  
clear
use $slersfile

*Correction bcs of two etype=gs elections.  
replace termz=2 if year==2016&sid==9&sen==1&(dno==23|dno==25)

*RUNOFFS
*for the three runoff general elections in the dataset, change the winner of the runoff to the winner in the first round, and change deter=0, and changes eseats.  The first round can be used for vote share, but now the winner doesn't correspond necessarily to the highest vote getter.  But the ultimate winner in the runoff is preserved so that winners can be aggregated when appropriate.  
*In each case below temp marks the rows of the election, the list is for inspection, two caseids have their outcome overwritten, and the first round becomes the determinative election (deter=1) while the runoff becomes deter=0.  
*GA 1968 HS
gen temp=sid==10&sen==0&dno==73&mmdpost==2&year==1968
list etype deter cand outcome eseats dtype caseid if temp
replace outcome="w" if caseid==46053
replace outcome="l" if caseid==46052
replace eseats=1 if temp&etype=="gfunset"
replace deter=1 if temp&etype=="gfunset"
replace deter=0 if temp&etype=="grunoff"
*GA 2010 HS
*NOTE(review): the header says 2010 but the filter below selects year==2000; confirm which year is intended.  
replace temp=sid==10&sen==0&dno==29&year==2000
list etype deter cand outcome eseats dtype caseid if temp
replace outcome="w" if caseid==42769
replace outcome="l" if caseid==42772
replace eseats=1 if temp&etype=="gfunset"
replace deter=1 if temp&etype=="gfunset"
replace deter=0 if temp&etype=="grunoff"
*VT 1986 HS
*This one is a two-seat election, and its first round is coded gfpartunset rather than gfunset.  
replace temp=sid==45&sen==0&dname=="orleans"&dno==3&year==1986
list etype deter cand outcome eseats dtype caseid if temp
replace outcome="w" if caseid==240180
replace outcome="l" if caseid==240178
replace eseats=2 if temp&etype=="gfpartunset"
replace deter=1 if temp&etype=="gfpartunset"
replace deter=0 if temp&etype=="grunoff"
drop temp

*CASE SELECTION
*identify cases that are missing important variables as cases not to use.
*dontuse=1 means that the election shouldn't be used for an analysis of the determinants of vote share, although it might be appropriate to use for other purposes, such as tabulating winners for a party by chamber, etc.  
*The following are held at irregular times, but should be kept for lagging vote share.  
gen lagonlyuse=etype=="ssg"
gen dontuse=0
*Any row whose uncert string mentions one of these problem keywords is marked dontuse.  
foreach string in incompleteelect dist party etype eseats generalproblem outcome year writeinstatus {
gen temp=regexm(uncert,"`string'")
replace dontuse=1 if temp==1
drop temp
}
*FL 2014 HS
*The outcome of the following election was thrown out.  But the person who received the most votes in the general election in question also won the special election that was called later to fill the seat, so the special election can be changed to deter=1 and the initial election can be kept.  
drop if sid==9&sen==0&dno==64&year==2015
replace deter=1 if sid==9&sen==0&dno==64&year==2014&etype=="g"
*DONTUSE
*general is 1 for etype g and gs only; everything else is marked dontuse.  
gen general=etype=="g"|etype=="gs"
replace dontuse=1 if general==0
*KEEP
keep if deter==1|general==1
*drop cases that are in non-partisan election state-years.  Drop louisiana's cases, because of the jungle primary there.  
drop if sid==27|(sid==23&year<1974)|sid==18
*drop odd-year election states
*sid==18 is already gone after the previous line, so repeating it here changes nothing.  
drop if (sid==17&year<1981)|(sid==17&year==1981&sen==0)|sid==18|sid==24|sid==30|sid==46
*but keep KY 1983 cases and KY Sen 1981 cases for lagging.  
replace lagonlyuse=1 if sid==17&year<1984
*drop IL HS cases before 1980 when they had cumulative voting
drop if dtype==10
*drop cases where a political party has more than 90% of the seats in both chambers of the state legislature going into the election in question.  The sessyear in the partisan balance file is therefore matched with the same election year.  This is mostly in the "solid South".  Also fill in gaps so that they are continuous stretches of time in the same state.  Keep the latter part of these state-years for each state in the dataset so that lagging can be done appropriately.
*Pattern for each state below: the last few excluded years are kept but marked lagonlyuse, and everything earlier is dropped.  
replace lagonlyuse=1 if sid==1&year<1985
drop if sid==1&year<1981
replace lagonlyuse=1 if sid==4&year<1992
drop if sid==4&year<1988
replace lagonlyuse=1 if sid==10&year<1982
drop if sid==10&year<1980
*I took 1992 out of the following because it's redistricting anyway.  
*NOTE(review): unlike the other rules in this list, this one uses year>1991 and has no matching drop; confirm the direction of the inequality is intended.  
replace lagonlyuse=1 if sid==11&year>1991
*id should have 2001 dropped by this method, but I left it in because it is an isolated case.  Also, technically, 2002 is the year that should be examined, and that doesn't have more than 90% Republican.  
*NOTE(review): all sid==18 and sid==24 rows were already dropped above, so the next four lines have no effect.  
replace lagonlyuse=1 if sid==18&year<1987
drop if sid==18&year<1983
replace lagonlyuse=1 if sid==24&year<1990
drop if sid==24&year<1987
replace lagonlyuse=1 if sid==33&year<1979
drop if sid==33&year<1975
*because the years in rhode island are sporadic, I'm leaving them in.  
replace lagonlyuse=1 if sid==40&year<1981
drop if sid==40&year<1977
replace lagonlyuse=1 if sid==43&year<1978
drop if sid==43&year<1974
*because the years in west virginia that should be excluded w this criteria are sporadic, I'm leaving them in.  
*I judged the following person wasn't actually on the ballot, although they were reported as having 0 votes in the returns.  
drop if year==2002&sid==8&sen==0&dno==9&partyz=="d"

*WRITEINS
*Remove scattering/write-in aggregate rows, recode probable write-ins that SLERs did not code as such, and drop the minor ones.  
drop if inlist(cand, "scattering", "writein")
*Vote share of each candidate within the election, in percent.  sum1 is the election total and sum2 the candidate total (a candidate can have several party lines).  
egen sum1 = sum(vote), by(year sid sen dname dno geopost mmdpost etype)
egen sum2 = sum(vote), by(year sid sen dname dno geopost mmdpost etype candid)
generate tempvoteper = (sum2/sum1)*100
*Probable write-ins who weren't coded as such in SLERs.  Not every candidate with one of these four party codes is a write-in; a very small vote count or share is part of the evidence.  Fewer than 20 votes covers the earlier "fewer than 10" screen as well.  
generate temp = 1 if inlist(party, "99993", "99994", "99997", "99998") & (vote<20 | tempvoteper<2)
*Regardless of party code: under 1 percent and under 50 votes.  
replace temp = 1 if tempvoteper<1 & vote<50
*Two cases identified by hand.  
replace temp = 1 if inlist(caseid, 142300, 142487)
foreach pvar in partyz partyt {
replace `pvar' = "writein" if temp==1
}
drop temp
*The following drops write-in candidates who received less than 5% of the total vote.  
drop if partyz=="writein" & tempvoteper<5 & !inlist(partyt, "d", "r")
drop if partyt=="writein" & vote==0
drop sum1 sum2 tempvoteper
*tab partyz
*tab partyz partyt

*VOTEMISS
*Remember which rows had no vote reported, because the (sum) in the collapse below turns missing into 0.  
gen votemiss=vote==.

*FIRSTCASE
*Missing becomes 0, and values 1 through 2 become 1.  
recode firstcase (.=0) (1/2=1)
replace firstcase=1 if sid==23&(year==1974|(year==1976&sen==1&dno!=47&dno!=64))

*UOA CAND
*Combine the party lines of a candidate into one row per candidate-election: votes are summed, dontuse and firstcase take the maximum over the lines.  
collapse (mean) votemiss (sum) vote (max) dontuse firstcase, by(year sid sen dname dno geopost mmdpost specpost cand candid partyt termz outcome exper tenure1 tenure2 dtype etype eseats regime redist redist1 redist2 redist3 nest nest1 nest2 nest3 deter general lagonlyuse)

*VOTEMISS
*verify that votemiss is either 0 or 1, and not in between.
*A value strictly between 0 and 1 would mean that only some of a candidate's lines were missing a vote.  
assert votemiss==0|votemiss==1
*that is correct
*replace vote=0 with system missing to deal with Stata's problem with this.  
replace vote=. if votemiss==1

*APPEND
*Append file with 2018 candidates.  
*The 2018 rows arrive without the election-level variables built above for SLERS rows, so those are filled in here.  
append using $cases2018
*merge1==3 rows are 2018 candidates matched to a seat; merge1==2 rows are seats that had no candidate row in the filers file.  
replace etype="g" if year==2018&merge1==3
replace etype="holdover" if year==2018&merge1==2
replace lagonlyuse=0 if year==2018
replace general=1 if year==2018&merge1==3
replace general=0 if year==2018&merge1==2
*No 2018 votes exist yet.  
replace votemiss=1 if year==2018
replace dontuse=0 if year==2018
*Build cand as "last, first middle suffix" for the 2018 rows.  Rows without a candidate end up with just ", ".  
replace cand=last+", "+first if year==2018
replace cand=cand+" "+middle if year==2018&middle!=""
*The suffix is restricted to 2018 like the other name parts, so a SLERS cand value can never be altered here.  
replace cand=cand+" "+suffix if year==2018&suffix!=""
drop last first middle suffix

*IDENTIFICATION
*verify that each candidate is only observed once per election.  The exception will be cases where candid is blank, for 2018 cands who aren't in SLERs.  
bysort year sid sen dname dno geopost mmdpost etype cand candid: assert _N==1
*all 1, good

*PARTYWEIRD
*Create a variable tracking party switches since the last time they ran, as long as it was within four years, and from what party to what.  
*don't consider it a party switch if they were a write-in or non-major party candidate in the past.  Just because someone writes in a Democrat who ran last time doesn't mean there's been a party switch.  And if someone is running as a dem or repub after being a third party candidate, that means little.
gen partyt2=partyt
tab vote if partyt=="writein"
*most of those obtained large numbers of vote, and the ave is about 2k.  
*The following is because it doesn't matter if someone went from nonmaj to partymissing.  If they go from dem to partymissing, that would still register.  
replace partyt2="nonmaj" if partyt=="nonpart"|partyt=="partymissing"
*id is a running group number: it increases by one at the first row of each candid and is carried down the candidate's rows.  
bysort candid (year): gen long id=1 if _n==1
replace id=sum(id)
*fill in id for candid=.
*Rows with missing candid all share one id at this point, which would chain unrelated people together.  Each of them gets its own id above the largest id in use.  The offset is taken from the data rather than hard-coded (it used to be 122000, chosen when the max was 117585), so a larger SLERS file cannot produce ids that collide with real candidates.  
sum id
replace id=r(max)+_n if candid==.
*Within each id, rows are put in year order and the previous row's party, year and outcome are pulled forward.  
bysort id (year): gen row=_n
tsset id row
by id: gen lag=partyt2[_n-1]
by id: gen yearlag=year[_n-1]
gen yeardif=year-yearlag
tab partyt2 lag if yeardif<5&year==2018
*"switch" tracks whether there has been a d to r or r to d switch.
*dswitch means the candidate is now a d and was an r last time; rswitch is the reverse.  
gen dswitch=partyt2=="d"&lag=="r"&yeardif<5
gen rswitch=partyt2=="r"&lag=="d"&yeardif<5
*"stealth" tracks whether a non-major party cand was a d or r in the recent past, even if they're a writein (only writeins with a lot of votes are left in at this point).  
gen dstealth=partyt2=="nonmaj"&lag=="d"&yeardif<5
gen rstealth=partyt2=="nonmaj"&lag=="r"&yeardif<5
*The "win" versions additionally require that the candidate won the previous election.  
by id: gen outcomelag=outcome[_n-1]
gen dswitchwin=dswitch==1&outcomelag=="w"
gen rswitchwin=rswitch==1&outcomelag=="w"
gen dstealthwin=dstealth==1&outcomelag=="w"
gen rstealthwin=rstealth==1&outcomelag=="w"
*Look for non-major party winners last time and code them for 2018 only.  There aren't any.  This may be because a lot of independents aren't in my records.  There are 411 of them.  Only 113 of them have candid values, but the winners should have them.  Maybe I wasn't consistent in matching non-major party candidates to slers.  Return to this later.  
*partyweird holds the name of the last flag in the list that is set, so a "win" flag overrides its plain counterpart.  
gen partyweird=""
foreach string in dswitch rswitch dstealth rstealth dswitchwin rswitchwin dstealthwin rstealthwin {
replace partyweird="`string'" if `string'==1
}
save $mainfile, replace

*MINI-OUTPUT FILE
*Export the 2018 candidates that carry a partyweird flag to a csv for inspection.  The data in memory are discarded afterwards and the main file is reloaded.  
merge m:1 sid using 009_StateCodes
*Drop state-code rows that matched no candidate.  
drop if _merge==2
keep if partyweird!=""&year==2018
keep year state sen dname dno geopost mmdpost cand partyweird partyt2 lag sid
order year state sen dname dno geopost mmdpost cand partyweird partyt2 lag sid
rename lag pastparty
rename partyt2 currentparty
export delimited 109partyweird$datezzz.csv, replace
clear


clear
use $mainfile
*These were only needed to build the switch/stealth flags.  
drop partyt2 id row lag yearlag yeardif outcomelag

*KEYVARS
*create variables tracking cands, wins, incs, inc2s, inc3s, vote and prior legislative experience.
*cand becomes a constant 1 per row so it can be summed into a candidate count; the name moves to candname.  
rename cand candname
gen cand=1
gen win=outcome=="w"
gen inc=exper=="inc"
*inc2/leg2 are 1 for tenure values 4 through 7; inc3/leg3 are 1 for tenure values 8 and up.  Values matched by no recode rule (missing, for example) pass through unchanged.  
recode tenure1 (0/3=0) (4/7=1) (8/max=0), gen(inc2)
recode tenure1 (0/7=0) (8/max=1), gen(inc3)
recode tenure2 (0/3=0) (4/7=1) (8/max=0), gen(leg2)
recode tenure2 (0/7=0) (8/max=1), gen(leg3)
gen other=exper=="other"
gen past=exper=="pastinc"|exper=="pastother"|exper=="pastboth"
*pull out # cands, vote and wins by party
*Each measure is copied into a d, r and o version that is missing for rows of the other parties; o is everything that is not d or r.  
foreach string in cand vote win inc inc2 inc3 other leg2 leg3 past {
gen d`string'=`string' if partyt=="d"
gen r`string'=`string' if partyt=="r"
gen o`string'=`string' if partyt!="d"&partyt!="r"
}
*win is kept because the TERMZ section still needs it; vote also stays.  
drop cand exper inc inc2 inc3 other leg2 leg3 past

*TERMZ
*Terms of 1.5 years are treated as 1 year.  
replace termz=1 if termz==1.5
*create weights for termz
*sum1 is the number of rows in the election and sum2 the number of rows sharing the same term length, so dif==0 means everyone in the election has the same term length.  max is the longest term in the election.  
bysort year sid sen dname dno geopost mmdpost etype: gen sum1=_N
bysort year sid sen dname dno geopost mmdpost etype termz: gen sum2=_N
gen dif=sum1-sum2
egen max=max(termz), by(year sid sen dname dno geopost mmdpost etype)
*the following is non-zero for just a few elections
*minweight flags winners whose term is shorter than the longest term in their election.  
gen minweight=win==1&termz!=max
*the following is non-zero for all elections.  
*maxweight flags winners with the longest term (or any winner when all terms are equal).  
gen maxweight=win==1&(dif==0|(termz==max))
drop sum1 sum2 dif win max

*VOTEMISS
*Find elections where some, but not all, candidates are missing a vote: mean is then strictly between 0 and 1.  
egen mean=mean(votemiss), by(year sid sen dname dno geopost mmdpost dtype eseats etype redist redist1 redist2 redist3 nest nest1 nest2 nest3)
list year sid sen dname dno geopost mmdpost partyt votemiss vote outcome if mean!=0&mean!=1
*There are five cases from two such elections.  I looked up one in the original source and the returns had a missing vote.  There is no Dem, so it's just a Repub uncont election, so if the non-major party candidate with no votes is dropped, there won't be a problem.  The Republican won that election.  
*Only non-major party rows with a missing vote inside such a mixed election are removed.  
drop if mean!=0&mean!=1&votemiss==1&partyt=="nonmaj"
*1 obs dropped, perfect
drop mean
*taking the max of votemiss by election will ensure that the other election isn't used.  

*TRACKCANDS
*Carry candid, candname, tenure1 and tenure2 into party-prefixed copies (dcandid, rcandname, ...) so they survive the collapse to the election level.  
*The copies are only filled for a party's rows in single-seat elections where the party has fewer than two candidates, or in two-seat sid==3 lower-chamber elections where the party has exactly two.  
foreach party in d r {
egen `party'candsum=sum(`party'cand), by(year sid sen dname dno geopost mmdpost)
foreach string in candid candname tenure1 tenure2 {
gen `party'`string'=`string' if (partyt=="`party'"&eseats==1&`party'candsum<2)|(partyt=="`party'"&eseats==2&`party'candsum==2&sid==3&sen==0)
}
drop `party'candsum
}
save $mainfile, replace


*2018 CAND INFO
*Put the following variables into a separate file so they can be merged with the district level output file for etype=1 elections.  
*Result: tempcandattributes, one row per district, with name and tenure columns for up to two d candidates (suffix a, b) and two r candidates (suffix a, b).  
clear
use $mainfile
keep if year==2018&eseats<3&partyt!=""&partyt!="nonmaj"
*Two-seat districts are only kept for sid==3.  
drop if eseats==2&sid!=3
drop if dcandname==""&rcandname==""
keep dcandname rcandname dtenure1 rtenure1 dtenure2 rtenure2 sid sen dname dno geopost mmdpost partyt
*row numbers a party's candidates within the district (1 or 2).  
bysort sid sen dname dno geopost mmdpost partyt: gen row=_n
tab row
*Give the d and r columns a common stub with a numeric suffix (1=d, 2=r) so that reshape long can stack them.  
rename dcandname candname1
rename rcandname candname2
rename dtenure1 tenure11
rename rtenure1 tenure12
rename dtenure2 tenure21
rename rtenure2 tenure22
*This has to be reshaped long before it can be reshaped wide.  
reshape long candname tenure1 tenure2, i(sid sen dname dno geopost mmdpost partyt row) j(partynum)
*row2 is the target column slot: 1=first d, 2=first r, 3=second d, 4=second r.  
gen row2=1 if partyt=="d"&row==1
replace row2=2 if partyt=="r"&row==1
replace row2=3 if partyt=="d"&row==2
replace row2=4 if partyt=="r"&row==2
*The long reshape creates an empty row for the party the candidate does not belong to; remove those.  
drop if candname==""&tenure1==.&tenure2==.
list if tenure1==.
*these merely have a comma for candname.
drop if tenure1==.
*now fully observed.  
drop partyt row partynum
reshape wide candname tenure1 tenure2, i(sid sen dname dno geopost mmdpost) j(row2)
*Turn the numeric slots back into party-prefixed names.  
foreach string in candname tenure1 tenure2 {
rename `string'1 d`string'a
rename `string'2 r`string'a
rename `string'3 d`string'b
rename `string'4 r`string'b
}
save tempcandattributes, replace

*MAINFILE
*Collapse the candidate-level file to one row per election.  
clear
use $mainfile
drop candid candname tenure1 tenure2 dcandname rcandname dtenure1 rtenure1 dtenure2 rtenure2

*Any 2018 values that have sysmis that are summed in the following will be inappropriately turned to zero.  Make sure this won't happen.  
*The local holds the list of variables that are summed; it has to be defined in the same run as the collapse that uses it.  
local varlist dvote rvote ovote dcand rcand ocand dwin rwin owin dinc rinc oinc dinc2 rinc2 oinc2 dinc3 rinc3 oinc3 dother rother oother dleg2 rleg2 oleg2 dleg3 rleg3 oleg3 dpast rpast opast dswitch dswitchwin rswitch rswitchwin dstealth rstealth dstealthwin rstealthwin maxweight minweight

*UOA ELECT
*Party counts and votes are summed, seats held are averaged, and flags take the maximum.  The longest and shortest term in the election are kept as maxtermz and mintermz.  
collapse (sum) `varlist' (mean) dheld rheld oheld (max) votemiss dontuse maxtermz=termz dcandid rcandid firstcase (min) mintermz=termz, by(year sid sen dname dno geopost mmdpost dtype eseats etype regime redist redist1 redist2 redist3 nest nest1 nest2 nest3 lagonlyuse specpost nextup)

*VOTEMISS
*replace vote=0 with system missing to deal with Stata's problem with this.  
*If any candidate in the election was missing a vote, all three vote totals are set to missing.  
recode dvote rvote ovote (*=.) if votemiss==1

*IDENTIFICATION
*The collapse groups on more variables than these seven, so this confirms the seven alone identify an election.  
bysort year sid sen dname dno geopost mmdpost: assert _N==1

*PROPS
*replace vars with proportions
*Each count is capped at the number of seats and then divided by the number of seats.  
*The next four are just to make the code work.
*(The loop below expects an o version of every measure; these four placeholders are dropped again right after it.)  
gen oswitch=0
gen ostealth=0
gen oswitchwin=0
gen ostealthwin=0
foreach string in cand win inc inc2 inc3 other leg2 leg3 past switch switchwin stealth stealthwin {
foreach party in d r o {
*adjust number of cands when election is over-contested.  This will get top two primary state-years and NV, but that's part of the plan, they are uncontested elections.  
replace `party'`string'=eseats if `party'`string'>eseats&`party'`string'!=.
replace `party'`string'=`party'`string'/eseats
}
}
drop oswitch ostealth oswitchwin ostealthwin
*for one party, inc+oth+past can't be more than 1.  Assume incumbents will beat other who will beat past.  That means if inc>1, adjust inc down to 1.  Then if inc+other>1, subtract other by the amount more than 1 that sum is.  Then, if inc+other(new)+past is >1, subtract past by the amount that sum is greater than 1 by.  
*inc was already capped at 1 by the loop above; temp is the excess over 1 at each step.  
foreach party in d r {
gen temp=`party'inc+`party'other-1
replace temp=0 if temp<0
replace `party'other=`party'other-temp
replace `party'other=0 if `party'other<0
replace temp=`party'inc+`party'other+`party'past-1
replace temp=0 if temp<0
replace `party'past=`party'past-temp
replace `party'past=0 if `party'past<0
drop temp
}

*UNCONT
*Classify each election by how fully the two major parties contest it.  dcand and rcand are proportions of seats at this point.  
*partuncont
*A party contests some but not all of the seats.  The .01/.99 bounds are a tolerance around 0 and 1.  
gen partuncont=((dcand>.01)&(dcand<.99))|((rcand>.01)&(rcand<.99))
*mixeduncont
*These are partially uncontested elections where the total number of major party candidates equals the number of seats to be won and so the party of the winners is known in advance.  
*dcand and rcand are ratios, so their sum is compared with 1 using a small tolerance rather than exact equality; an exact test can miss a case when the two ratios do not add to exactly 1 in floating point.  
gen mixeduncont=(abs(dcand+rcand-1)<.000001)&dcand!=1&rcand!=1
tab mixeduncont
*146 of those when this was computed with an exact ==1 test; re-check the count.  
*uncont
*Fully uncontested by one major party, or mixed as defined above.  
gen uncont=rcand==0|dcand==0|mixeduncont==1

*VOTESHARE
*dper is the Democratic percentage of the two-party vote.  It is never filled for 2018.  
gen dper=(dvote/(dvote+rvote))*100 if year!=2018
*Elections contested by only one major party, and won by it, are set to 100 or 0 regardless of the vote.  
replace dper=100 if dwin==1&dcand==1&rcand==0&year!=2018
replace dper=0 if rwin==1&dcand==0&rcand==1&year!=2018
*If dper is system missing and the election is fully or partially contested by the major parties, make dontuse=1.  
replace dontuse=1 if dper==.&uncont==0&year!=2018
*make sure that there is no dem cand when dper=0 and no repub cand when dper=100
assert dcand==0 if dper==0&year!=2018
*always 0, good
assert rcand==0 if dper==100&year!=2018
*always 0, good
*No usable contested election may have a missing vote: count such rows and require the count to be zero.  
sum sid if dontuse==0&votemiss==1&dper!=100&dper!=0&uncont==0&year!=2018
local aaa=r(N)
assert `aaa'==0
*always 0, good
replace dper=. if year==2018
*0 changes made for the last line.  

*BIGTHIRD
*Exclude cases with a strong non-major party presence.  
*Scores of "1" indicate a large proportion of third party votes.
*Scores of "2" track elections with 1) no dem & no repub cands, 2) at least one nonmaj winner, 3) at least one nonmaj inc, 4) at least one nonmaj legislator from the other chamber, or 5) at least one nonmaj legislator who served in the past.  
gen bigthird=0
*oper is the non-major party share of the total vote; a score of 1 requires more than 20 percent in an election fully contested by both major parties.  
gen oper=ovote/(dvote+rvote+ovote)
replace bigthird=1 if oper>.2&oper!=.&dcand==1&rcand==1
drop oper
*A score of 2 overrides a score of 1 and is never given to 2018 rows.  The five conditions are grouped with parentheses, the documented grouping operator in expressions (square brackets were used here before).  
replace bigthird=2 if ((dcand==0&rcand==0)|(owin!=0&owin!=.)|(oinc!=0&oinc!=.)|(oother!=0&oother!=.)|(opast!=0&opast!=.))&year!=2018
*The VOTESHARE step cannot mark 2018 rows (it is restricted to year!=2018); this resets any dontuse a 2018 row picked up earlier.  
replace dontuse=0 if year==2018

*WAVE
*Attach the national wave measure by year.  
merge m:1 year using $wavefile
tab year _merge
*there are still some odd-numbered years in there and they didn't merge.  Since we have no idea how voters respond to national conditions in odd-year elections, change those cases to 0, which is about the mean.  
gen oddyear=mod(year,2)
replace wave=0 if wave==.&oddyear==1
drop _merge oddyear

*STATELEVEL
*Attach govparty and smidpen from tempgub, which is keyed on stateno rather than sid.  
gen stateno=sid
merge m:1 year stateno using tempgub
*Remove governor rows with no election attached, and rows that came in only from the wave file (no state, so they cannot match).  
drop if _merge==2|(year<1967&_merge==1)|(year==.&_merge==1)
*make sure all cases received a value from the merge.
assert _merge==3
drop stateno _merge

*D MINUS R
*Compute as dif of dem and repub
*Change to system missing when not to be used.
*Each unprefixed measure is the Democratic proportion minus the Republican proportion.  
foreach string in cand inc inc2 inc3 other leg2 leg3 past switch switchwin stealth stealthwin {
gen `string'=d`string'-r`string'
}

*Merge file that reports the proportion of seats up in each legislative chamber here.  
merge m:1 year sid sen using tempseatprop
*merge=1 only in 2018, and the special elections in 2015 and 2017, if anything else is observed, there is a problem.  
assert _merge!=1 if year!=2015&year!=2017&year!=2018
*Chamber-years with a propup value but no election in the main file are not needed.  
drop if _merge==2
drop _merge

save $mainfile, replace



*LAGGED VARS
*Build the file of lagged variables.  
*SPLIT
*The lagged file is written out once and then read twice: one copy carries the max weights/terms, the other the min weights/terms.  
*NOTE(review): an earlier note here said inc2 and inc3 aren't lagged because they are almost perfectly collinear with inc and inclag; both are nevertheless carried in the keep list below.  
use $mainfile, clear
*2018 elections haven't been conducted yet, so they can't supply lagged values.  
keep if year!=2018

*Only the numeric part of specpost is moved into mmdpost; destring, force turns the alpha entries into missing, and those are skipped.  
destring specpost, force gen(temp)
replace mmdpost=temp if temp!=.
drop temp
keep year sid sen dname dno geopost mmdpost maxweight minweight maxtermz mintermz dper cand inc inc2 inc3 other leg2 leg3 past switch switchwin stealth stealthwin wave govparty smidpen bigthird dontuse propup partuncont mixeduncont uncont redist dcandid rcandid dwin
save tempslerslagged, replace

*LAGFILE1
*Temp lag file #1 uses maxweight and maxtermz.  
use tempslerslagged, clear
drop minweight mintermz
rename (maxweight maxtermz) (weight termz)
*The 1974 house rows for sid 12 ("id" in the original notes) are given posts: the rows become post 1, and a second copy of them becomes post 2.  
replace mmdpost=1 if sid==12&sen==0&year==1974
save temp, replace
keep if sid==12&sen==0&year==1974
replace mmdpost=2
append using temp
save temp, replace

*LAGFILE2
*Temp lag file #2 uses minweight and mintermz.  The sid 12 1974 house rows are left alone in this copy.  
use tempslerslagged, clear
drop maxweight maxtermz
rename (minweight mintermz) (weight termz)
*Rows with a zero min weight contribute nothing and are removed.  
keep if weight!=0

*APPEND
*Stack lag file #1 underneath lag file #2, then remove the scratch file.  
append using temp
erase temp.dta
*YEAR
*The election year of each row becomes yearlag, and year becomes the year the row serves as a lagged value for (yearlag plus the term length).  
rename year yearlag
gen year=yearlag+termz
*In one situation, a district is up for an election in a year ending in "0", and has a four year term.  Two years later, in a year ending in "2", which is a redistricting year, a district with the same number is up for election, and has a two year term.  One inappropriate modeling strategy would result in the district up in the year ending in "0" contributing to the lagged value of the election up in the year ending in "4."  So when more than one election is nested in an election that is going to have a lagged value, and the later one of those elections (in this case, the one taking place in a year ending in "2") has redist!=0, drop the lagged values of the earlier election.  
*NOTE(review): the paragraph above says redist!=0, but the flag computed below uses redist==1 only -- confirm which is intended.  
*Another situation is similar to the above, but is more problematic.  In this example, a district is up for an election in a year ending in "0" and has a four year term.  Two years later, the map is redrawn.  There is no election for a district with that name/number in a year ending in "2."  Two years after that, there is re-redistricting, and the district in question has a value of redist=2.  I don't see the problem.  If redist=1, then no lagged value will arrive there.  If redist=2, it should be in terms of the map that was in place the last time an election in that locale was conducted, which, by the definition of this example, it wasn't in a year ending in "2."  What if the term for the prior district in question was only two years, but it was four years ago when it was up?  Is it possible for a situation to be that unfair/messed up?  Anything is possible.  
*row numbers the lagged rows that land on the same target district-year, ordered from the earliest yearlag to the latest.  
bysort year sid sen dname dno geopost mmdpost (yearlag): gen row=_n
tab row
*There are either values of 1 or 2.  There are 39 cases of row=2.  
*temp flags the later row of a pair when that later election has redist==1.  
gen temp=row==2&redist==1
tab temp
*there are 30 cases of temp=1
tab sid sen if temp==1
*redistproblem spreads the flag to every row that shares the same target district-year.  
egen redistproblem=max(temp), by(year sid sen dname dno geopost mmdpost)
tab year sid if redistproblem==1
*The only way this will be messed up is if there's re-redistricting in a year ending in 4, as there was in AK in 2014.  
*states from above
*except for half the cases in nd, all the below are from state senates.  
*ak 76 86 14
*co 84
*hi 84 94
*ia 72 84 94 04 14
*mt 96
*nd 94 04
*or 04
*ut 94
*Within a flagged group, drop the earlier row so only the later election supplies the lagged values.  
drop if row==1&redistproblem==1
*COLLAPSE
*Some target district-years are now observed more than once, so the rows are collapsed to one per district-year.  
*c is summed under the frequency weights, so after the collapse it is the weighted number of rows that went into each cell.  
gen c=1
collapse (sum) c (mean) dper cand inc inc2 inc3 other leg2 leg3 past switch switchwin stealth stealthwin wave govparty smidpen propup partuncont mixeduncont uncont rcandid dcandid dwin (min) lagyearmin=yearlag (max) lagyearmax=yearlag bigthird dontuse redistproblem [fweight=weight], by(year sid sen dname dno geopost mmdpost)
*Candidate ids and the win indicator are set to system missing whenever c is above 1.  For dwin, the later note in the original says this is done to get it ready for the repeat contest, etc. vars.  
foreach v in dcandid rcandid dwin {
    replace `v'=. if c>1
}

*RENAME
*The district identifiers become the match variables for the merge back into the main file.  
rename (dname dno geopost) (dnamemerge dnomerge geopostmerge)
*Everything that is a lagged value gets "lag" as a suffix.  
foreach base in dper cand inc inc2 inc3 other leg2 leg3 past switch switchwin stealth stealthwin wave govparty smidpen propup bigthird dontuse partuncont mixeduncont uncont dcandid rcandid dwin {
    rename `base' `base'lag
}
drop c
save tempslerslagged, replace

*MAIN FILE
*MERGE
*Bring the lagged variables into the main file.  
*The match variables start as the current district identifiers and are swapped for redist1/redist2/redist3 when redist is 2, 4, 7 or 9.  
use $mainfile, clear
gen dnamemerge=dname
replace dnamemerge=redist1 if inlist(redist,2,4,7,9)
gen dnomerge=dno
replace dnomerge=redist2 if inlist(redist,2,4,7,9)
gen geopostmerge=geopost
replace geopostmerge=redist3 if inlist(redist,2,4,7,9)
*The merge has to be many to 1 because, when redist=2, a district designation appears twice.  The past value then lands in two districts, but the inappropriately matched one has redist=1 and its lagged values are changed to system missing later.  
merge m:1 year sid sen dnamemerge dnomerge geopostmerge mmdpost using tempslerslagged
erase tempslerslagged.dta
tab year _merge
*For earlier years, the merge=2 cases cluster in redistricting years, as expected.
*In 2018 the only lagged row allowed to go unmatched is nc sen #29, which has nowhere to go (33 in 2016 became 29 in 2018).  
assert _merge!=2 if year==2018&(sid!=33|sen!=1|dnomerge!=29)
*Any 2018 main-file row without a lagged match must be one whose nextup is 2020.  
assert nextup==2020 if year==2018&_merge==1




*The merge=1 cases are also clustered in redistricting years, as expected.  
drop if _merge==2
drop _merge

*YEARLAG
*tempdif is the spread between the latest and earliest lag year that fed a collapsed lagged row.  
gen tempdif=lagyearmax-lagyearmin
tab tempdif redistproblem
*redistproblem must always go with a spread of 0.  
assert tempdif==0 if redistproblem==1
tab redist if redistproblem==1
*redist=0 for 21 cases, redist=1 for 6 cases, redist=4 for 1 case, and redist=8 for 1 case.  Redist=0 is no problem.  The redist=1 cases are no problem, they will simply be changed to system missing like all the other lagged values in that circumstance.  redist=4 is a combo of redist=2 and redist=3, and I don't see any problem with that.  redist=8 isn't a problem, the earlier value of the pair has been dropped.  
*There are 6 cases of tempdif=2.
sort sid sen dname dno geopost mmdpost year
list sid sen dname dno geopost mmdpost year tempdif if tempdif!=.&tempdif>0
*None of those are problematic, they are different streams coming together as they should.  
drop tempdif

*LAGS, REDIST & MISS
*Make vars blank if redistricting occurred, unless the lagged var doesn't vary within a state-year and is from a prior year where all seats were up.  
*District-level lagged variables: always blanked when redist is 1, 6 or 8.  
foreach lagvar in dperlag candlag inclag inc2lag inc3lag otherlag leg2lag leg3lag pastlag switchlag stealthlag switchwinlag stealthwinlag partuncontlag mixeduncontlag uncontlag dcandidlag rcandidlag {
    replace `lagvar'=. if inlist(redist,1,6,8)
}
*wavelag, govpartylag & smidpenlag
*These often don't have to be changed to system missing if there is redist=1.  If all seats were up the last time elections for the chamber were held, the lagged value doesn't have to be changed to system missing.  
*propuplagmin is the smallest lagged proportion-up within the chamber-year; when it is missing, propuplagmin<1 is false and nothing is blanked.  
egen propuplagmin=min(propuplag), by(sid year sen)
*The following isn't the last word on maximizing how many of these cases are observed.  More will be done later to fill these in.
*FIX: the original code blanked the contemporaneous govparty and smidpen here instead of govpartylag and smidpenlag.  Because varabbrev is off, those names could only refer to the current-year variables, which discarded current-year data and left the lagged versions untouched.  The lagged versions are the ones this section is about, and the ones filled back in later from the state-year file.  
*The change counts recorded for the old code (2903 for wavelag, 3298 each for govparty and smidpen) no longer apply to the last two and should be re-checked on the next run.  
foreach lagvar in wavelag govpartylag smidpenlag {
    replace `lagvar'=. if inlist(redist,1,6,8)&propuplagmin<1
}

*NC LAGGED VARS FOR 2018

*NC 2018 MISS
*Missing-value report for the 2018 NC rows, first for redist=0 and then for redist=2.  
*In both reports, everything that should be observed is fully observed except for "contest".  For redist=2 it's hard to tell for dcandid and rcandid, but a lot of those would be missing, and many cases are observed for them.  
foreach r in 0 2 {
    mdesc if sid==33&year==2018&redist==`r'
}
save $mainfile, replace

*MERGEPREP
*Give the NC lagged file the remaining key variables it needs to line up with the main file.  
*NOTE(review): an earlier note here said a m:1 merge must be done, but the merge below is declared 1:1, so both files have to be unique on the key.  
*NOTE(review): rows found only in the NC lagged file (_merge==1) are not removed after the merge -- confirm there are none.  
use 012_103estnclag20180819, clear
drop redistbrian redist2brian
gen dname=""
gen geopost=.
gen mmdpost=.

*MERGE
*Merge nc 2018 lagged values with main file.  
merge 1:1 year sid sen dname dno geopost mmdpost using $mainfile

*NC 2018
*Copy the NC values from the incoming "e"-suffixed vars into the main file vars for 2018 NC rows with redist=1, then discard the incoming vars.  
rename dper0lage dperlage
foreach v in dperlag candlag inclag inc2lag inc3lag leg2lag leg3lag otherlag pastlag {
    replace `v'=`v'e if _merge==3&redist==1&sid==33&year==2018
    drop `v'e
}
*The first four in the following list are always 0 in NC 2016, so a simple "0" will suffice for them instead of bringing them in.  The last two variables cannot take on values other than "0" in NC, because they only occur in FFA-MMDs.  
recode switchlag switchwinlag stealthlag stealthwinlag partuncontlag mixeduncontlag (*=0) if year==2018&sid==33
*CHECK
*For 2018 NC rows with redist!=1, the incoming no-candidate flags must agree with candlag.  
assert nodemlage==1 if candlag==-1&sid==33&redist!=1&year==2018
assert nodemlage==0 if inlist(candlag,0,1)&sid==33&redist!=1&year==2018
assert norepublage==1 if candlag==1&sid==33&redist!=1&year==2018
assert norepublage==0 if inlist(candlag,0,-1)&sid==33&redist!=1&year==2018
*That all looks good
*The two flags can't both be set, so their sum has to lie between 0 and 1; that sum becomes uncontlag for NC 2018.  
gen temp=nodemlage+norepublage
assert inrange(temp,0,1) if sid==33&year==2018
replace uncontlag=temp if sid==33&year==2018
drop nodemlage norepublage temp _merge

save $mainfile, replace

*UOASTATEYEARLAGS
*National and state variables can also often be filled in when a district did not receive a match with a case from the past.  
*There is a way more could be filled in, but I'm not doing that right now.  If a section of a state was all up at the same time last time, then the lagged var could be filled in, even if other parts of the state weren't up.
*Start from the term lengths of every row and keep only chamber-years where all seats were up (propup=1).  
use $mainfile, clear
keep year sid sen mintermz maxtermz
merge m:1 year sid sen using tempseatprop, keep(match) nogenerate
erase tempseatprop.dta
keep if propup==1
drop propup
*temp marks rows whose min and max term lengths disagree.  
gen temp=mintermz!=maxtermz
tab temp
*only 7 cases of that, good.
list year sid sen if temp==1
*They must all be sid 11 ("HI" in the original notes); any chamber-year containing one is dropped entirely.  
assert sid==11 if temp==1
egen max=max(temp), by(year sid sen)
drop if max==1
drop temp max mintermz
rename maxtermz termz
*If seats have different term lengths, only keep the cases with the shortest term lengths.  
egen min=min(termz), by(year sid sen)
keep if min==termz
*national wave
merge m:1 year using $wavefile, keep(match) nogenerate
rename wave wavelag2
*state conditions
gen stateno=sid
merge m:1 year stateno using tempgub, keep(match) nogenerate
erase tempgub.dta
rename (govparty smidpen) (govpartylag2 smidpenlag2)
*Move each row forward to the year it serves as a lag for.  
replace year=year+termz
*Reduce to one row per combination of target year, chamber and lagged values.  
gen c=1
collapse (sum) c, by(year sid sen wavelag2 govpartylag2 smidpenlag2)
drop c
*Each chamber-year must now appear exactly once.  
bysort year sid sen: gen temp=_N
assert temp==1
drop temp
save temp, replace
*MAIN FILE
*MERGE
*Attach the chamber-year lagged values built above; rows found only in the temp file are discarded by keep().  
use $mainfile, clear
merge m:1 year sid sen using temp, keep(master match) nogenerate
erase temp.dta
*Fill each national/state lagged var from its chamber-year counterpart, but only where it is still system missing.  
*2536 changes made for all three of the below
foreach v in wave govparty smidpen {
    replace `v'lag=`v'lag2 if `v'lag==.
    drop `v'lag2
}

*LAGONLYUSE
*Cases with lagonlyuse=1 were only being kept so that they could provide lagged values.  Now that the lagged vars have been brought in, get rid of them.  
keep if lagonlyuse!=1
drop lagonlyuse

save $mainfile, replace


*Reload the main file for the switch diagnostics.  Nothing in this section is saved: the temp variables created below are dropped at the end.  
clear
use $mainfile

*SWITCH
*There are some errors in switch.  
*Problem #1: there are a few cases where switch=0 but switchwin!=0.  switch should always be non-0 as long as switchwin isn't 0.  
tab switchwin if switch==0
*There are three such cases, -1, .3333, and 1.  However, they may actually be instances of d & r switchers canceling out while only one (by definition sometimes) won last time.
list dcandid rcandid dswitch rswitch dswitchwin rswitchwin if switchwin!=.&switchwin!=0&switch==0
*That is in fact why, so this actually isn't a problem.  

*Problem #2: there are some examples of eseats=1 elections where I compare dcandid, rcandid, dcandidlag and rcandidlag.  They clearly show party switches, yet they aren't coded as such in switch.  
*egen diff() returns 1 when the listed variables differ and 0 otherwise, so after the "1-" flip below a value of 1 means the ids are equal.  
*tempdswitch=1: this election's Dem candidate id equals last election's Repub candidate id.  
egen tempdswitch=diff(dcandid rcandidlag)
*temprswitch=1: this election's Repub candidate id equals last election's Dem candidate id.  
egen temprswitch=diff(rcandid dcandidlag)
replace tempdswitch=1-tempdswitch
replace temprswitch=1-temprswitch
*Both indicators are blanked when either current candidate id is missing.  
*NOTE(review): the lagged ids being compared (rcandidlag, dcandidlag) are not checked for missing here, so a missing lagged id shows up as "no switch" rather than as unknown -- confirm that is intended.  
replace tempdswitch=. if dcandid==.|rcandid==.
replace temprswitch=. if dcandid==.|rcandid==.
*NOTE(review): a Dem candidate who was last election's Repub candidate would have won last time when dwinlag==0, not 1 (and the reverse for the Repub side) -- these two conditions look swapped; confirm against how dswitchwin and rswitchwin are defined.  
gen tempdswitchwin=tempdswitch==1&dwinlag==1
gen temprswitchwin=temprswitch==1&dwinlag==0
tab switch tempdswitch if eseats==1&uncontlag==0
*There are 46 cases where the new method said there wasn't a switch, but there was a switch.  That could be because of the four year lag that is allowed when switch was computed.  
tab switchwin tempdswitchwin if eseats==1&uncontlag==0
*I don't have time to figure this out now.
*COME BACK TO THIS.
drop tempdswitch temprswitch tempdswitchwin temprswitchwin

*CONTESTVARS
*REPEATCONTEST-REPEATWINNER-REPEATLOSER
*Find elections where all the same Democratic and Republican candidates faced each other since the last election.
*Find elections where the loser in the last election is running again, but the winner has stepped down.  
*Interact vote share with incumbency, except when a repeat contest is occurring.  
*Only do this for elections to a single seat.  
*By definition, these wouldn't have been uncontested in the past.  But incumbency status may have changed if it was an open seat in the first round.  
*This could be redone.  If the dem and repub cands have never faced off before, then you would know with certainty that it hasn't happened before, even if there was redistricting.  Or is this fundamentally different when there is redist?
*ddif/rdif are 1 when the party's candidate id differs from the lagged id and 0 when it is the same (egen diff() treats two missing values as equal).  
egen ddif=diff(dcandid dcandidlag)
egen rdif=diff(rcandid rcandidlag)
*allthere=1 when all four candidate ids are observed in a single-seat election.  
gen allthere=dcandid!=.&dcandidlag!=.&rcandid!=.&rcandidlag!=.&eseats==1
*Single-seat 2018 elections that are up in 2018 are treated as allthere regardless of which ids are observed.  
replace allthere=1 if year==2018&nextup==2018&eseats==1
*CONTEST
*The label is assigned by a chain of replaces, so when several conditions hold for a case the LAST matching line below wins.  Do not reorder these lines.  
*First, the eight combinations of same/different candidates by who won last time (dwinlag), for allthere cases only.  
gen contest="repeatcontestdwin" if ddif==0&rdif==0&dwinlag==1&allthere==1
replace contest="repeatcontestrwin" if ddif==0&rdif==0&dwinlag==0&allthere==1
replace contest="repeatdwinner" if ddif==0&rdif==1&dwinlag==1&allthere==1
replace contest="repeatrwinner" if ddif==1&rdif==0&dwinlag==0&allthere==1
replace contest="repeatdloser" if ddif==1&rdif==0&dwinlag==1&allthere==1
replace contest="repeatrloser" if ddif==0&rdif==1&dwinlag==0&allthere==1
replace contest="newcandsdwinner" if ddif==1&rdif==1&dwinlag==1&allthere==1
replace contest="newcandsrwinner" if ddif==1&rdif==1&dwinlag==0&allthere==1
*Districts electing other than one seat.  
replace contest="ffammd" if eseats!=1&eseats!=.
*Uncontested this time and/or last time.  
replace contest="uncontlasttime" if uncontlag==1&uncont==0
replace contest="uncontthistime" if uncontlag==0&uncont==1
replace contest="uncontbothtimes" if uncontlag==1&uncont==1
replace contest="partuncontlasttime" if (partuncontlag>.25&partuncontlag<1.25)&eseats!=1&eseats!=.
*Redistricting, don't-use and big-third-party flags override everything above.  
replace contest="redist" if redist==1|redist==6
replace contest="altholdoverredist" if redist==8
replace contest="dontuselag" if dontuselag==1
replace contest="dontuse" if dontuse==1
*Note that bigthirdlag=1 is only applied before 2018, while bigthirdlag=2 has no year restriction.  
replace contest="bigthirdlag1" if bigthirdlag==1&year<2018
replace contest="bigthirdlag2" if bigthirdlag==2
replace contest="bigthird1" if bigthird==1
replace contest="bigthird2" if bigthird==2
*Two hand-picked 2018 districts.  
replace contest="speciallagged" if (sid==23&sen==0&dno==32&geopost==2&year==2018)|(sid==38&sen==1&dno==18&year==2018)
*If this is the first time a district-post has been observed in the dataset, make it firstcase=1.  
egen minyear=min(year), by(sid sen dname dno geopost mmdpost)
*LOOK INTO LATER: THE FOLLOWING MIGHT NOT BE CORRECT IF THERE WAS REDISTRICTING.  
replace contest="firstcase" if year==minyear
replace contest="firstcase" if firstcase==1
*25 changes made from the last.  
replace firstcase=1 if year==minyear
*3601 changes made from the last.  Those were expected, as since some years have been excluded because they were one party states, what constitutes a "first case" in slers has changed from the official dataset.  
drop minyear
*The contest variables will be changed to system missing under the following conditions.  
*unsuitable=1 when any of the redistricting, don't-use, big-third-party, speciallagged or firstcase conditions holds.  
gen unsuitable=redist==1|redist==6|redist==8|dontuselag==1|dontuse==1|bigthirdlag==1|bigthirdlag==2|bigthird==1|bigthird==2|contest=="speciallagged"|firstcase==1

*CONTESTVARS
*Signed indicators (suffix "b"): 1 for the first contest label, -1 for the second, 0 otherwise.  
gen rcontestb=(contest=="repeatcontestdwin")-(contest=="repeatcontestrwin")
gen rwinnerb=(contest=="repeatdwinner")-(contest=="repeatrwinner")
gen rloserb=(contest=="repeatdloser")-(contest=="repeatrloser")
*Unsigned versions: the -1 values are folded into 1.  
recode rcontestb rwinnerb rloserb (-1=1), gen(rcontest rwinner rloser)
*Interactions of the unsigned indicators with lagged vote share and lagged incumbency.  
foreach lagged in dperlag inclag {
    foreach kind in contest winner loser {
        gen r`kind'`lagged'=r`kind'*`lagged'
    }
}
*Shorthand for the three families of variables and for the "indicator observed but inclag interaction missing, before 2018" condition.  
local rcflags rcontest rwinner rloser rcontestb rwinnerb rloserb
local rcinc rcontestinclag rwinnerinclag rloserinclag
local rcdper rcontestdperlag rwinnerdperlag rloserdperlag
local rcgap rcontestinclag==.&rcontest!=.&year!=2018
*Before 2018, everything is set to system missing for unsuitable cases.  
recode `rcflags' `rcdper' `rcinc' (*=.) if year<2018&unsuitable==1
*examine the values of the variables
foreach v in contest `rcflags' `rcinc' {
    di "`v'"
    tab `v' if eseats==1&redist!=1
}
*The last three vars are observed less often than the first vars listed (ignoring contest).  Why is this?  It's because inclag isn't observed.  But those should only be unobserved when conditions would cause the above to not be observed also.  No, not for firstcases.  
tab year if `rcgap'
*10 cases are missing values.  Are these eseats>1?
tab eseats if `rcgap'
*4 of 10 are, but 6 are eseats=1.  
sum `rcflags' `rcinc' `rcdper' if `rcgap'
*the last six are never observed, the others are always observed (i.e., 10 times).  
sum sid `rcflags' `rcinc' `rcdper' if eseats!=1&year!=2018&eseats!=.
*when those are observed, they are always 0.  They aren't always observed under those conditions, though.  
*They should always=0 when eseats!=1, so do that.
recode `rcflags' `rcinc' `rcdper' (*=0) if eseats!=1&eseats!=.
*why would lagged incumbency or lagged vote share be missing when some type of contest has been tracked?
tab redist if `rcgap'
*3 are redist=2.  3 are redist=0.
sort sid sen dname dno geopost mmdpost year
list sid sen dname dno geopost mmdpost year dtype if `rcgap'&redist==0
*3 out of 3 are weirdness with the md east shore districts.  
list sid sen dname dno geopost mmdpost year if `rcgap'&redist!=0
*The above are the redist=2 cases.  So let's say a district didn't change.  But because of redistricting across the state, one of the cands did run in another district.  I'm not sure what's driving this, but there are only three cases, ignore for now, I don't have time to do this.  
*Treat the leftover cases as unsuitable and blank their indicators.  
replace unsuitable=1 if rcontestinclag==.&year!=2018&rcontest!=.
*6 changes, good
recode `rcflags' (*=.) if rcontestinclag==.&year!=2018
*6 changes each, good

*CHECK
*This section is diagnostic: apart from sorting the data and dropping the candidate-id helper variables at the end, it changes only scratch variables (temp, temp2, max) that are dropped before the save.  
tab inc rwinnerb if eseats==1&rcontestb==0&unsuitable==0&uncontlag==0&uncont==0&year!=2018
*How is it possible that, in a single member contest, that wasn't preceded by redistricting or an uncontested election, and it isn't a repeat contest, and it isn't a repeat winner contest, that there is an incumbent?  There are 134 such cases above.  
*temp=1 marks those puzzling cases: an incumbent (inc is -1 or 1) but no repeat winner or repeat contest.  
gen temp=1 if rwinnerb==0&(inc==-1|inc==1)&eseats==1&rcontestb==0&unsuitable==0&uncontlag==0&uncont==0&year!=2018
*Get rid of instances of party switchers who are prior winners.  Incumbency would be coded -1 or 1, but the dem cand wouldn't be the same as last time, same for repubs.  
replace temp=. if switch==-1|switch==1
*That got 105
tab sid sen if temp==1
*There are 29 such elections.  These are spread across chambers and states.  They don't seem to be concentrated in chambers that have posts.  
tab year sen if temp==1
*they are spread across time.  
sort sid sen dname dno geopost mmdpost year
list sid sen dname dno geopost mmdpost year inc rwinnerb redist ddif dcandid dcandidlag rdif rcandid rcandidlag allthere dwinlag if temp==1
*If the code has been done correctly (which I believe it has) both the Dem cand and the Repub cand have changed from the last election.  One explanation is that these are incumbents who came from a different district or post.  So there isn't a repeat winner, but there is an incumbent from another district.  Test this theory.
*Dem incs
*temp2 marks the puzzling cases with inc==1; max spreads that mark to every row with the same dcandid so the candidate's full history can be listed.  
gen temp2=1 if temp==1&inc==1
tab temp2
*there are 16 of those
egen max=max(temp2), by(dcandid)
local varlist dcandid year temp sid sen dname dno geopost mmdpost inc dwinlag
sort `varlist'
list `varlist' if max==1&dcandid!=.
*6 out of 16 do involve a district or post switch.  
*post switch
*This candidate id is cleared from the puzzle list outright (temp set to missing for all of its rows).  
replace temp=. if dcandid==98255
*1 change, good
*These candidate ids were identified by hand from the listing above; temp=2 marks them as explained.  
replace temp=2 if (dcandid==31507|dcandid==142346|dcandid==200811|dcandid==214317|dcandid==314749)&temp==1
*5 changes, good
drop temp2 max
*Repub incs
*Same procedure for the puzzling cases with inc==-1, grouped by rcandid.  
gen temp2=1 if temp==1&inc==-1
tab temp2
*there are 13 of those
egen max=max(temp2), by(rcandid)
local varlist rcandid year temp sid sen dname dno geopost mmdpost inc dwinlag
sort `varlist'
list `varlist' if max==1&rcandid!=.
*10 out of 13 do involve a district or post switch.  
*post switch
replace temp=2 if (rcandid==148223|rcandid==200923|rcandid==201707|rcandid==240674|rcandid==284338|rcandid==296685|rcandid==301453|rcandid==314666|rcandid==329838|rcandid==330000)&temp==1
*10 changes, good
drop temp2 max
*That still leaves 13
tab redist if temp==1
*one of those is redist=4, so the number of seats in the district changed.
replace temp=. if temp==1&redist==4
list if temp==1
*DON'T HAVE TIME TO GET THE LAST 12, COME BACK TO THIS.
*If those are incumbents from an entirely different district, should they really be coded as incumbents?  A demoted incumbent, that only takes their strategic politician factor into account?  The problem is the N isn't high enough to compute this.  Ignore this problem for now.  
drop temp
*The reverse puzzle: a repeat winner according to rwinnerb, but no incumbent (inc==0).  
gen temp=1 if inc==0&(rwinnerb==-1|rwinnerb==1)&eseats==1&rcontestb!=-1&rcontestb!=1&unsuitable==0&uncontlag==0&uncont==0&year!=2018
*a bunch are id hs 1976 as expected, no problem.  They should be that way.  
replace temp=. if sid==12&sen==0&year==1976
tab temp
drop temp

*The candidate-id helper variables are no longer needed after this point.  
drop ddif dcandid dcandidlag rdif rcandid rcandidlag allthere dwinlag

save $mainfile, replace

*CHECK
*The following uses SLERs to check into the discrepancies uncovered above.  It only lists rows from the slers file; nothing is changed or saved, and the data are cleared at the end.  
*many of the rest should be caused by dif numbers of seats up between time t-1 and time t, with time t only having one seat up.  
*That leaves 7 that have a winner from last time (according to rwinnerb) running who isn't an incumbent.  
clear
use $slersfile
sort sid sen year dname dno geopost mmdpost partyt
*ak ? sen E
list year cand partyt exper outcome termz if deter==1&sid==2&sen==1&dname=="e"&year>1971&year<1977
*that is an instance of different numbers of seats up between time t-1 and time t.  
*ar 1996 hs #12
list year mmdpost cand partyt exper outcome termz dtype eseats if deter==1&sid==4&sen==0&dno==12&year>1993&year<1997
*No, there were incumbents, but one was a repub and one was a dem.  I remember this now.  One switched a post, so even though there wasn't redistricting, two incumbents opposed each other.  
*ct 1970 hs #143 (first by district, then by following one candidate across districts)
list year mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==7&sen==0&dno==143&year>1967&year<1971
list year dno mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==7&sen==0&cand=="green, edwin r."&year>1967&year<1971
*caused by two incs against each other, but no redist, and one didn't appear in the other district.  District switcher?  Yes, switched districts.  
*il 2004 hs #20
list year dno mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==13&sen==0&year>2001&year<2005&dno==20
*also two incs, no redist, and one from a dif district.
*il 1984 hs #105
list year dno mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==13&sen==0&year>1981&year<1985&dno==105
*also two incs, no redist, and one from a dif district.
*mt 2012 hs #63
list year dno mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==26&sen==0&year>2009&year<2013&dno==63
*also two incs, no redist, and one from a dif district.
*nh 1994 hs hillsborough #35
list year dno mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==29&sen==0&year>1991&year<1995&dno==35&dname=="hillsborough"
*also two incs, no redist, and one from a dif district.
*wa 1976 hs #36 post #1.
list year dno mmdpost cand partyt exper outcome termz dtype eseats redist if deter==1&sid==47&sen==0&year>1973&year<1977&dno==36
*another post switching inc, who switched posts to face another incumbent.  
clear


use $mainfile, clear

*COME BACK TO THIS LATER
*The commented-out commands below were solutions to errant cases in the old code, before the contest section was revised.  They may not work any more, so they are not run until there is time to come back to this.  It's barely going to affect anything.  
*Make the inconsistent scores agree.  
*gen temp2=1 if inc==0&(rwinnerb==-1|rwinnerb==1)&eseats==1&uncontlag==0&uncont==0&rcontestb!=-1&rcontestb!=1&dontuse==0&dontuselag==0
*replace rwinnerb=0 if temp2==1
*7 changes, good
*CONTEST
*tab contest
*replace contest="incseatswitch" if temp2==1
*7 changes, good
*drop temp2
*tab contest dtype, missing
*tab year if contest==""&dtype==1

*CLEANUP
*The merge keys and the proportion-up helpers are no longer needed.  
drop propup dnamemerge dnomerge geopostmerge propuplag propuplagmin

*CONTEST
*There are very few of these that are missing.  
*DEAL WITH THEM LATER.  
replace contest="unclassifiedforthemoment" if contest==""&year<2018
*No 2018 election that is up in 2018 may be left without a contest label.  
sum sid if contest==""&year==2018&nextup==2018
assert r(N)==0

*2018CONTESTVARS
local contestvars rcontest rcontestb rcontestdperlag rcontestinclag rwinner rwinnerb rwinnerdperlag rwinnerinclag rloser rloserb rloserdperlag rloserinclag
sum sid `contestvars' if year==2018&nextup==2018
*no more than 3 are missing for any var.
list sid sen dname dno geopost mmdpost if year==2018&nextup==2018&rloserdperlag==.
*those are the two ma cases and the one ri case.  I believe those are going to be called deterministically anyway, but fill them in with 0 just in case.  
recode `contestvars' (.=0) if year==2018&nextup==2018

save $mainfile, replace





*THE SECTION FROM HERE TO AROUND ROW 1401 ISN'T DONE YET, AND THESE VARIABLES AREN'T USED IN THE AUGUST 7, 2018 RUN.  OCT 30, 2018 NOTE: THE FOLLOWING SECTION WAS USED IN THE AUGUST 26, 2018 RUN, AND PERHAPS SOME OF THE RUNS BETWEEN AUGUST 7, 2018 AND AUGUST 26, 2018.  
*SEE NOTES AT VERY BOTTOM OF THIS SECTION ABOUT A DISCREPANCY THAT WAS UNCOVERED.  IT IS ONLY A SIGN THAT THE ALTLAGPOSSIBLE (OR WHATEVER IT'S CALLED) VAR IS MESSED UP, BUT IT MIGHT GO DEEPER THAN THAT.  

*SUPPLEMENTAL LAGGED VARIABLES
*ALTLAGGED: IDENTICAL, NESTED, OTHERPOST, & ALTERNATINGSEAT LAGGED VALUES
*The following code creates a battery of "alternative lagged" variables (they are actually to be used in addition to the lagged variables, not instead of them).  
*These alternates are from 
*1) state senate and house districts that are identical
*2) state house districts that are nested inside of a state senate district (for state senate only)
*3) state house posts or (rarely) state senate posts that represent an identical locale
*4) the last time an election was up in a district with alternating seats, although it was a different seat.  
*Many of the above are especially useful for state senates when redist=6, 7 or 8.  
*vars produced from this
*altlagdper
*altlagcand
*altlaginc
*altlagincuncont
*altlagother
*altlagotheruncont
*altlagpast
*altlagpastuncont
*altlagyear: this variable indicates what year the alternative lagged variables are from.  
*RULES
*If dontuse=1 for the alternative lagged election, don't use it, simply delete it.  
*Floterial districts aren't taken advantage of here.  
*If the non-alternative lagged variables are from two years ago, alternative lagged variables from four years ago aren't used.  
*If the non-alternative lagged variables are from four years ago, alternative lagged variables from two years ago are sometimes used.  This situation is tracked.  
*If the contemporaneous district has been redistricted since it was up last, only bring in lagged nested/identical district info if the contemporaneous district was holdover redistricted.  
*YEARSTOADD
*Based on the work I did in the workbook with the 105 prefix, these are the rules I developed.  These rules are all consistent with the notion that an alternative lagged value shouldn't be from an earlier time than the non-alternative lagged value.  They can be more recent in time, however.  
*#1: state house to state senate: add termz
*#1: state senate to state house: add 2 years, with the exception of md and also the exception of nd for 2000 and after, which have 4 added to them.  
*#2: state house to state senate: add termz
*#2: state senate to state house: not relevant / can't be done.  
*#3: post-mmds: add termz
*#4: alternating seats: add 2, but only if termz=4.  Drop otherwise.  
*#2 and #4 are the only categories where it is possible for the lagged alternate year to be different than the lagged non-alternate year.  Unless there is redistricting, #4 will always have a different alternate lagged year than the non-alternate lagged year.  #2 will be the same if it is a state senate with two year terms or there is a etype=gs election.  
*You have to make sure that the lagged values that are being brought forward to create the alternative lagged values aren't themselves the traditional/non-alternative lagged values.  This is only a potential problem for method #3, and how to deal with this is explained below.  For method #4, alternating seats, there is no threat of collision, as long as the following rule is followed.  Alternating seats are always brought forward two years, and they are only brought forward if termz=4.  
*TARGETDISTRICT
*How the district that the alternative lagged values are going to be matched up with is determined is as follows for the following origin districts.  
*#1: state house to state senate: value of nest1, nest2, nest3.  These will have a weight equal to the number of seats.  There might be two seats in a house district, such as the WA state house with two posts, or the AZ state house with two seat ffa-mmds.  
*#1: state senate to state house: value of dname, dno and geopost brought back from house after matching on nest1, nest2, nest3.  The state senate district goes to each post of that dname, dno, geopost, if there are mmd-posts.  These will probably always have a weight of one, but I could be wrong.  
*#2: state house to state senate: value of nest1, nest2, nest3, then averaged for the districts contained in the nest1, nest2, nest3.  After the collapse / summation of their seats, these will be given a weight of "1" since if they are averaged with anything else, it will be in the state senate, and these house seats are collectively equivalent to one state senate seat.  
*#1a, #1b and #2 can all be grabbed at the same time.  They are all given a suffix of "1."
*#3: both chambers: create a file with all the post-mmds in it.  Number the rows.  Create as many columns as the maximum number of rows in a group.  To be more specific, each "column" in the last sentence actually represents a set of variables defined by dname, dno and geopost.  Put the names of the districts in each "column."  When row number = the column number, delete the contents of that column for that row.  The associated variables that will be "alternative lagged" will also be in the same families.  These will also be turned to system missing when row#="column"#.  The data will then be reshaped long.  All cells with no contents will then be dropped.  The data will then be collapsed to the dname-dno-geopost-mmdpost.  This is then the district-post designation that will be matched on when the lagged variables are brought forward in time.  However, the number of seats (rows - 1) contributing to the computation will be saved, as another collapse will be necessary.  The variables in this battery of variables will have a "2" suffix.  
*#4: both chambers: match will be on sen, dname, dno, geopost, ignoring mmdpost.  If a single election is going towards a sen-dname-dno-geopost group with two or more posts, this alternate lagged value will be contributing to more than one election's alternate lagged value.  I'm not sure if that matters.  The variables in this battery of variables will have a "3" suffix.  
*ALTERATION TO TARGET DISTRICTS NECESSARY: If redist=2, xxx or xxx, redist1, redist2 and redist3 should be used for the "receptors", just like with non-alternative lagged variables.  However, if redist= (i.e., some type of holdover redistricting has occurred), then dname, dno and geopost should be used for the target.  
*COLLISION
*In this context, "collision" means that alternate lagged values from different years are matched with a contemporaneous election.
*See if there are any mismatches in the above, and see how many cases are at stake.  I'll drop the earlier ones if this happens.
*DIFLAGGEDYEARS
*Note that the non-alternative lagged values for an election may be from a different year than the alternative lagged values.  Variables track this.  When they are different, lagged national and state variables have to be used.  How often does this happen?  If seldom, just don't use such cases for alternative lagged variables, and then you don't have to mess with the alternative lagged national and state variables.  
*REDIST
*After these variables are brought forward in time, they are turned to system missing if redist=1 or 5.  Also change it to system missing if redist=6 and altlagtermz=4.  I don't think that's possible, aren't there notes about that somewhere?
*Note that regime may be different between the alternative lagged and contemporaneous/target variables if redist=2, 3 or 4, so using disagreement between regime-alt-lag and regime as a flag isn't useful, unless redist=
*SENNEST
*district designations are only observed for nest variables when sen=0.  So the state house districts that are equivalent to the state senate districts have to be brought over.  
*Step 1: from the nested house rows, build a lookup with one row per house district / nest designation, re-keyed so that the nest designation becomes the (senate) district key and the house designation is carried as nest1b-nest3b.  
use $mainfile, clear
keep if nest==1&sen==0
gen c=1
collapse (sum) c, by(sid regime dname dno geopost nest1 nest2 nest3)
rename (dname dno geopost nest1 nest2 nest3) (nest1b nest2b nest3b dname dno geopost)
gen sen=1
*The lookup is left on disk as temp.dta; it is not erased in this section.  
save temp, replace

*Step 2: attach the lookup to the main file.  keep(master match) discards lookup rows with no counterpart in the main file.  
*there probably are some state senate districts that "never saw the light of day."  I don't see what can be done if there are.  It doesn't mean anything bad about what is being done now, however, since it doesn't imply a state senate alternative lagged value won't be utilized.  
use $mainfile, clear
merge m:1 sid sen regime dname dno geopost using temp, keep(master match)
*there is a problem if _merge=1 for a nested senate row, however.  There shouldn't be any of those.
tab nest _merge if sen==1
gen flag=nest==1&_merge==1&sen==1
*tab sid if flag==1
*IL and NV only (3 cases for the latter)
*IL doesn't have matches for 1968 to 1980, as suspected.  This is because the entire IL state house before 1981 was dropped, since they had cumulative voting there.  
*NV didn't have matches because they must be matched with the floterial district.  Just let them go, no big deal.  
*Step 3: for matched nested senate rows, copy the house designation into nest1-nest3.  
forvalues i=1/3 {
    replace nest`i'=nest`i'b if _merge==3&nest==1&sen==1
}
drop nest1b nest2b nest3b c flag _merge
*Step 4: "uncont" versions of inc, other and past, which are 0 whenever the election was uncontested and the original value otherwise.  
foreach stem in inc other past {
    gen `stem'uncont=cond(uncont==1,0,`stem')
}
save $mainfile, replace



*ONE & TWO
*1) state senate and house districts that are identical
*2) state house districts that are nested inside of a state senate district (for state senate only)
*The identical senate values for the house, the house values for the senate, and the averaged values of nested house districts for the senate are all gathered in one pass.  Identical districts do not always carry the same designation in both chambers (this was tested), so the nest designation is what gets matched on.  
use $mainfile, clear
keep if nest!=.&!(nest==2&sen==1)
replace nest=1 if nest==3
keep if !(nest1==""&nest2==.&nest3==.)
egen dif=diff(mintermz maxtermz)
tabulate dif
*only six are different.  
local lagvars dper cand inc incuncont other otheruncont past pastuncont
keep sid sen year `lagvars' nest nest1 nest2 nest3 mintermz maxtermz eseats dontuse
*2018 can never be a source year.  
drop if year==2018
summarize
*They are fully observed.  
rename eseats altlagweight
*The year is not advanced before collapsing.  If it were, two different lagged years landing in one contemporaneous year would be averaged together with the wrong weights.  
collapse (sum) altlagweight (mean) `lagvars' (min) mintermz (max) dontuse maxtermz, by(sid sen year nest nest1 nest2 nest3)
replace altlagweight=1 if sen==0&nest==2
*Flag source elections that are dontuse=1 so they can be blanked after the merge.  
generate altlagsysmis1=1 if dontuse==1
*24 cases unusable, good, not many
drop dontuse
*see if mintermz and maxtermz are the same always
egen dif=diff(mintermz maxtermz)
tabulate dif
*altlagyear records which year the alternative lagged values come from; year becomes the contemporaneous year they are carried to.  
rename year altlagyear
generate year=altlagyear+2 if sen==1
replace year=altlagyear+mintermz if sen==0
replace year=altlagyear+mintermz if sen==1&(sid==20|(sid==34&year>1998))
drop mintermz maxtermz dif
*Flip the chamber: values observed in one chamber are lagged values for the other.  
replace sen=1-sen
foreach v of local lagvars {
    rename `v' altlag`v'
}
*Test for identification
bysort sid sen nest1 nest2 nest3 year altlagyear: generate sum1=_N
assert sum1==1
*Collision: test for multiple lagged years within one contemporaneous year.  There is one exception, and the earlier of the two lagged years is the one dropped.  
bysort sid sen nest1 nest2 nest3 year (altlagyear): generate row=_n
by sid sen nest1 nest2 nest3 year: generate rows=_N
*Make sure only one case is dropped.  
summarize sid if row==1&rows==2
assert r(N)==1
drop if row==1&rows==2
drop row rows
bysort sid sen nest1 nest2 nest3 year (altlagyear): replace sum1=_N
assert sum1==1
*there is just one lagged year for each year going forward.  
drop sum1
rename (nest1 nest2 nest3) (dnamemerge dnomerge geopostmerge)
save temp, replace
*MAINFILEMERGE
*Merge the first battery of alternative lagged variables into the main file.  
use $mainfile, clear
*Merge keys default to the district's own designation, but use the redist1 redist2 redist3 designation when redist is 2, 4, 7 or 9.  
local keys dname dno geopost
forvalues k=1/3 {
    local key : word `k' of `keys'
    generate `key'merge=`key'
    replace `key'merge=redist`k' if inlist(redist,2,4,7,9)
}
*A many-to-1 merge is needed because when redist=2 a district designation appears twice.  The past value lands in two districts, but since redist=1 for one of them, the inappropriately matched one is changed to system missing below.  
merge m:1 year sid sen dnamemerge dnomerge geopostmerge using temp
*assess patterns with the merge
tabulate sid _merge if sen==0
*very few merge=2 there
tabulate sid _merge if sen==1
*One would expect the number of merge=2 to be about the same as merge=3 for each state where merges are possible and where there are four year senate terms and two year house terms.  This is what one sees.  
drop if _merge==2
drop _merge
*Flag altlag values as unusable for specific values of redist.  
replace altlagsysmis1=1 if altlagyear!=.&inlist(redist,1,5)
*Also unusable if redist is 6 or 8 and the value would travel four or more years.  
replace altlagsysmis1=1 if altlagyear!=.&(year-altlagyear)>3&(year-altlagyear)!=.&inlist(redist,6,8)
foreach v in altlagyear altlagweight altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont {
    replace `v'=. if altlagsysmis1==1
}
*There is no danger of a lagged value from four years ago leapfrogging redistricting two years ago.  The state senate values are never made to go forward more than two years except for MD and ND for 2000 and on, where there are four year terms for the state house.  The state house values never go more than two years forward, except for MD and ND for 2000 and on.  By design, the alt lagged vars never go four years forward when the contemporaneous district it is matching up with was up two years ago.  There might be some problems bcs of gs elections in nd in 2000 and later.  
save $mainfile, replace
*THREE
*3) state house posts that represent an identical locale and are up simultaneously
*For every post in a post-mmd group, the alternative lagged value is built from the OTHER posts in the same sid-sen-year-dname-dno-geopost group.  This is done by building one set of columns per row number, blanking the diagonal (a post never contributes to itself), summing within the group, and reshaping long again.  
clear
use $mainfile
keep if dtype==2|dtype==5|dtype==8
*make sure there are at least two rows in each group
bysort sid sen year dname dno geopost (mmdpost): gen rows=_N
tab rows
*none have just one row, but that may well change when dontuse=1 cases are blanked.  That's okay, the counterpart can still provide a lagged value.
egen dif=diff(mintermz maxtermz)
tab dif
*no difs
*2018 can never be a source year.  
drop if year==2018
*dontuse=1 cases are not dropped.  They are blanked after the diagonal is removed, so that singletons still receive a row for their counterparts.  
*get rid of the "zzz" malarkey: non-numeric specpost values become missing.  
destring specpost, force replace
*It won't hurt anything to do the following now.  The matrix of post mmds will still be done correctly.  
replace mmdpost=specpost if specpost!=.
keep sid sen year dname dno geopost mmdpost dper cand inc incuncont other otheruncont past pastuncont mintermz eseats dontuse
rename eseats weight
rename mintermz termz
*create a row number within each group
bysort sid sen year dname dno geopost (mmdpost): gen row=_n
save temp, replace
*major restructure
egen rowmax=max(row), by(sid sen year dname dno geopost)
sum row
*maxrow is the largest group size; it fixes how many column sets are built.  
local maxrow=r(max)
*Step one: for each column, record the weight of counterpart rows that are dontuse=1, so that cases whose altlag values should be system missing can be tracked.  
forvalues col=1/`maxrow' {
    gen dontuse2`col'=weight if (row!=`col')&(`col'<=rowmax)&dontuse==1
    replace dontuse2`col'=0 if dontuse2`col'==.
}
*Step two: fill each column with the values of every usable row except the row the column stands for.  
foreach string in dper cand inc incuncont other otheruncont past pastuncont weight {
    forvalues col=1/`maxrow' {
        gen altlag`string'2`col'=`string' if (row!=`col')&(`col'<=rowmax)&dontuse==0
    }
    drop `string'
}
*The end of the range follows maxrow rather than a hard-coded column count, so the collapse keeps working when the largest group size changes.  
collapse (sum) dontuse21-altlagweight2`maxrow', by(year sid sen dname dno geopost)
reshape long dontuse2 altlagdper2 altlagcand2 altlaginc2 altlagincuncont2 altlagother2 altlagotheruncont2 altlagpast2 altlagpastuncont2 altlagweight2, i(year sid sen dname dno geopost) j(row)
*Columns beyond the size of the group carry nothing at all.  
drop if altlagweight2==0&dontuse2==0
*A row whose only counterparts were dontuse=1 has no usable value.  
gen altlagsysmis2=1 if altlagweight2==0
drop dontuse2
save temp2, replace
*temp2.dta has to be merged into temp.dta to grab the mmdpost values needed to merge into the main dataset.  
use temp, clear
drop weight dontuse dper cand inc other past incuncont otheruncont pastuncont
merge 1:1 year sid sen dname dno geopost row using temp2
erase temp2.dta
assert _merge==3
*THE FOLLOWING WAS TRUE, HOPEFULLY ISN'T NOW.  merge=3 for 5527 cases, merge=1 for 5 cases, and merge=2 for no cases.  The merge=1 cases are the ones that have dontuse=1 for all their counter-parts (which I would guess is one out of one of their counter-parts).  
drop _merge row
*altlagyear2 records which year the alternative lagged values come from.  
rename year altlagyear2
gen year=altlagyear2+termz
drop termz
*Test for identification
bysort sid sen dname dno geopost mmdpost year altlagyear2: gen sum1=_N
assert sum1==1
*assertion correct, no problem
*Collision: test for multiple lagged years within one contemporaneous year.  There is one pair; the earlier lagged year is dropped.  
bysort sid sen dname dno geopost mmdpost year (altlagyear2): gen row=_n
bysort sid sen dname dno geopost mmdpost year (altlagyear2): gen rows=_N
drop if row==1&rows==2
drop row rows
bysort sid sen dname dno geopost mmdpost year (altlagyear2): replace sum1=_N
assert sum1==1
*assertion now correct, no problem
drop sum1
*The collapse produced sums; divide by the summed weight to turn them into means.  
foreach string in altlagdper2 altlagcand2 altlaginc2 altlagincuncont2 altlagother2 altlagotheruncont2 altlagpast2 altlagpastuncont2 {
    replace `string'=`string'/altlagweight2
}
*rename to the merge keys used in the main file
rename (dname dno geopost) (dnamemerge dnomerge geopostmerge)
save temp, replace
*MAINFILEMERGE
*Merge the second battery (post-mmd counterparts, suffix 2) into the main file.  
use $mainfile, clear
merge m:1 sid sen dnamemerge dnomerge geopostmerge mmdpost year using temp
drop if _merge==2
drop _merge
*Flag altlag values as unusable for specific values of redist, then blank every variable of the battery on flagged rows.  
replace altlagsysmis2=1 if altlagyear2!=.&inlist(redist,1,6,8)
foreach v in altlagyear2 altlagweight2 altlagdper2 altlagcand2 altlaginc2 altlagincuncont2 altlagother2 altlagotheruncont2 altlagpast2 altlagpastuncont2 {
    replace `v'=. if altlagsysmis2==1
}
*Lots of changes, as expected.  
save $mainfile, replace
*FOUR
*4) the last time an election was up in a district with alternating seats, although it was a different seat or seats.
*The mmdpost should also differ from what it is being matched with.  That can't easily be done, because alternating post-mmds are mixed with alternating single seaters here.  
use $mainfile, clear
keep if inlist(dtype,4,5,6)&dontuse==0&year<2018
egen dif=diff(mintermz maxtermz)
tabulate dif
*there are only 16 with differing term lengths.  Drop them, they may cause problems.  
drop if dif==1
*Two year terms are dropped as well (135 dropped).  
drop if mintermz==2
*Don't deal with the weird cases in specpost, they are ffa-mmds that are being lagged anyway.  This would be redundant.  
drop if specpost!=""
local lagvars dper cand inc incuncont other otheruncont past pastuncont
keep sid sen year dname dno geopost mmdpost `lagvars' eseats
rename eseats altlagweight3
foreach v of local lagvars {
    rename `v' altlag`v'3
}
summarize
*everything is fully observed.  
bysort sid sen year dname dno geopost: generate sum=_N
tabulate sum
*only 87.8% have values of 1, so they have to be collapsed.  
collapse (sum) altlagweight3 (mean) altlagdper3 altlagcand3 altlaginc3 altlagincuncont3 altlagother3 altlagotheruncont3 altlagpast3 altlagpastuncont3, by(sid sen year dname dno geopost)
*altlagyear3 records which year the alternative lagged values come from; they are always carried two years forward.  
rename year altlagyear3
generate year=altlagyear3+2
*rename to the merge keys used in the main file
rename (dname dno geopost) (dnamemerge dnomerge geopostmerge)
*Test for identification
bysort sid sen dnamemerge dnomerge geopostmerge year altlagyear3: generate sum1=_N
assert sum1==1
*Collision: test for multiple lagged years within one contemporaneous year
bysort sid sen dnamemerge dnomerge geopostmerge year (altlagyear3): replace sum1=_N
assert sum1==1
*all 1, good, no group holds multiple lagged years.  
drop sum1
save temp, replace
*MAINFILEMERGE
*Merge the third battery (alternating seats, suffix 3) into the main file.  The merge keys are no longer needed afterwards.  
use $mainfile, clear
merge m:1 sid sen dnamemerge dnomerge geopostmerge year using temp
drop if _merge==2
drop dnamemerge dnomerge geopostmerge _merge
*Which values of redist are observed when the altlag3 vars are observed?
tabulate redist if altlagyear3!=.
*almost all are present, excepting 3 and 5
*There is only one value of 6, that's probably a mistake.  
list sid sen year dname dno geopost if altlagyear3!=.&redist==6
*ak 1976 sen, dname=o
tabulate year redist if sid==2&sen==1
list year redist if sid==2&sen==1&dname=="o"
*It definitely should be a 7.  
replace redist=7 if altlagyear3!=.&redist==6
*one change made, good.
*Flag altlag values as unusable when redist=1, then blank every variable of the battery on flagged rows.  
generate altlagsysmis3=1 if altlagyear3!=.&redist==1
foreach v in altlagyear3 altlagweight3 altlagdper3 altlagcand3 altlaginc3 altlagincuncont3 altlagother3 altlagotheruncont3 altlagpast3 altlagpastuncont3 {
    replace `v'=. if altlagsysmis3==1
}
*Lots of changes, as expected.  
save $mainfile, replace




use $mainfile, clear
*Create three variables (pos1 pos2 pos3) indicating whether each of the three types of alternative lag is possible for a row.  
*altlagyear is temporarily given a "1" suffix so the three batteries can be looped over.  
rename altlagyear altlagyear1
forvalues b=1/3 {
    *a row can never be both flagged unusable and still carry a lagged year
    summarize sid if altlagsysmis`b'==1&altlagyear`b'!=.
    assert r(N)==0
    generate pos`b'=1 if altlagsysmis`b'==1|altlagyear`b'!=.
}
rename altlagyear1 altlagyear
*ALTLAGYEAR COLLISIONS
*Difference in source year between each pair of batteries.  
generate collision12=altlagyear-altlagyear2
generate collision13=altlagyear-altlagyear3
generate collision23=altlagyear2-altlagyear3
tabulate collision12
*only 6 collisions.  In all 6, altlagyear1 is two more than altlagyear2, so battery 1 is the more recent one.  
tabulate collision13
*no collisions
tabulate collision23
*78 collisions: altlagyear3 is always 2 greater than altlagyear2, so battery 3 is the more recent one.  
*In both kinds of collision battery 2 is the older one, so battery 2 is blanked.  
foreach stem in altlagyear altlagweight altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont {
    replace `stem'2=. if collision12==2|collision23==-2
}
drop collision12 collision13 collision23
*ALTLAG AVERAGED
*Fold battery 2 and then battery 3 into the unsuffixed variables.  Where only the incoming battery is observed its value is taken over; where both are observed a weighted average is formed and the weights are added.  Where both are observed the years agree, so altlagyear only needs filling where it is missing.  
forvalues s=2/3 {
    generate temp=altlagyear!=.&altlagyear`s'!=.
    foreach v in altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont {
        replace `v'=`v'`s' if `v'==.
        replace `v'=[(`v'*altlagweight)+(`v'`s'*altlagweight`s')]/(altlagweight+altlagweight`s') if temp
        drop `v'`s'
    }
    replace altlagweight=altlagweight`s' if altlagweight==.
    replace altlagweight=altlagweight+altlagweight`s' if temp
    replace altlagyear=altlagyear`s' if altlagyear==.
    drop altlagweight`s' altlagyear`s' temp
}
*ALTLAGYEAR V LAGYEAR
*How often is altlagyear different from lagyear (aka lagyearmin and lagyearmax)?
*altlagyear will always be different if lagyearmin!=lagyearmax.  
egen dif=diff(lagyearmin lagyearmax)
gen dif2=lagyearmin-altlagyear
tab dif2 if dif==0
*It's different 3,930 out of 12,517 times (31.42%).  
*2 cases see lagyearmin two years greater than altlagyear, i.e. the alternative lag is older than the regular lag.  Those alternative lags are discarded.  
*altlagyear and altlagweight are blanked together with the value variables.  Blanking only the values leaves rows with a source year and a weight but no values, which breaks the 1-to-1 correspondence between the altlag variables that is checked further down.  dif2 has already been computed, so blanking altlagyear here does not change it.  
*NOTE(review): on these rows altlagyear no longer marks an alternative lag as having existed.  Confirm that nothing downstream relies on altlagyear being observed for discarded cases.  
foreach string in altlagyear altlagweight altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont {
    replace `string'=. if dif2==2&dif==0
}
*The other 3,930 cases see altlagyear two years greater than lagyearmin, which is entirely according to plan.
*Break altlag variables into two groups on the basis of dif2 (plus adding any cases of altlag that are observed when dif=1 to the group with dif2=-2).
gen altlaggroup=1 if dif==0&dif2==0
replace altlaggroup=2 if (dif==1&altlagyear!=.)|(dif==0&dif2==-2)
*ALTLAG: ZERO FILL
*Many system missing values of altlag variables can be changed to 0, because they can't possibly have a value.  pos1 pos2 pos3 track this: 1 means an alternative lag of that type was observed or flagged unusable, 0 means it is impossible, missing means undecided.  
*The cases with altlagsysmis1, 2 and 3 can't be "0."
*A district may never have had 1) nested/identical districts, 2) post-mmds or 3) alternating seats, and still receive an alt lagged value from the district before it for specific values of redist.  The values of redist that leave this possibility open are:
*#1/2 (ident/nest): redist=2, 3, 4 (house districts can only go forward 4 years if there are both four year house and senate terms in a state, and these always happen to be either non-staggered in the state as a whole, or in the case of nd for 2000 and on, in sync in the district (except for a few etype=gs elections).  Therefore, state house elections as altlag vars from an earlier regime can never hit a holdover redistricted state senate election.  Since state house districts never have holdover redistricting (except for nd hs for 2000 and on), this can't be a problem for them.  In nd 2000 and on, this also won't happen.  
*#3 (postmmds): redist=2, 3, 4, 5, 7, 8, 9
*#4 (alternating): redist=2 (since these never advanced more than 2 years, they could never hit a redist=6, 7, 8 or 9 if they started in a prior regime).  
gen temp1=nest!=.
gen temp2=dtype==2|dtype==5|dtype==8
gen temp3=dtype==4|dtype==5|dtype==6
*If a chamber never had nests, post-mmds or alternating seats, that type is impossible for every row of the chamber.  
forvalues aaa=1/3 {
    egen max=max(temp`aaa'), by(sid sen)
    replace pos`aaa'=0 if max==0
    drop max
}
*Excepting the values of redist outlined above, if a district never has that type of thing, pos# is changed to 0.  
forvalues aaa=1/3 {
    egen max`aaa'=max(temp`aaa'), by(sid sen dname dno geopost)
}
*redist=2 is excluded for pos1, as it is for pos2 and pos3.  When redist=2 the battery 1 merge is keyed on redist1 redist2 redist3 rather than on the district's own designation, so the district's own history (max1) says nothing about whether a value could arrive.  redist=5 stays excluded as before, so this change only narrows the zero fill.  
replace pos1=0 if pos1==.&max1==0&redist!=2&redist!=3&redist!=4&redist!=5
replace pos2=0 if pos2==.&max2==0&redist!=2&redist!=3&redist!=4&redist!=5&redist!=7&redist!=8&redist!=9
replace pos3=0 if pos3==.&max3==0&redist!=2
*If only nesting appears in a state, and not identical districts, then house districts can never have a nested altlag value.
gen tempnest=nest==1|nest==3
egen max=max(tempnest), by(sid sen)
replace pos1=0 if pos1==.&max==0&sen==0
drop tempnest max
*House rows that are themselves not identical (nest missing or 2) cannot receive a battery 1 value.  
replace pos1=0 if (nest==.|nest==2)&pos1==.&sen==0
list year sid sen dname dno geopost mmdpost redist if pos1==1&(nest==.|nest==2)&sen==0
*I can see why those are pos1=1 even though they are nested.  For my theory to work, they should be redist=2 or 7, but they aren't.  

*LOOK INTO THE ABOVE LATER.

*If no such thing (nest, postmmd, or alternating) appeared in the state in the last four years, then altlagged values from a nest, postmmd or alternating seat would be impossible.  
*The count of contradicting rows is kept in its own macro (nbad).  Storing it in the loop macro overwrote the year, so the replace below tested year==0 and never changed anything.  
forvalues aaa=1/3 {
    forvalues yr=1972(2)2018 {
        local start=`yr'-4
        *temp marks the window from four years back through the year in question
        gen temp=year<=`yr'&year>=`start'
        egen max=max(temp`aaa'), by(sid temp)
        *no row may already be coded possible when the state had nothing of this type in the window
        sum sid if max==0&pos`aaa'==1&year==`yr'&temp==1
        local nbad=r(N)
        assert `nbad'==0
        replace pos`aaa'=0 if max==0&pos`aaa'==.&year==`yr'&temp==1
        drop temp max
    }
}
*If no such thing (postmmd, or alternating) appeared in the state-chamber in the last four years, then altlagged values from a postmmd or alternating seat would be impossible.  It is possible for a nest, although I doubt it would actually happen.  
forvalues aaa=2/3 {
    forvalues yr=1972(2)2018 {
        local start=`yr'-4
        gen temp=year<=`yr'&year>=`start'
        egen tempyear=max(temp`aaa'), by(sid sen temp)
        sum sid if tempyear==0&pos`aaa'==1&year==`yr'&temp==1
        local nbad=r(N)
        assert `nbad'==0
        replace pos`aaa'=0 if tempyear==0&pos`aaa'==.&year==`yr'&temp==1
        drop temp tempyear
    }
}
*If any one of pos1, pos2 and pos3 is coded "1," then it doesn't matter if the other two are sysmis or not.  
summarize sid pos1 pos2 pos3 if !(pos1==1|pos2==1|pos3==1)
*there are still missing values for all three, although pos3 is close to fully observed, and the other two aren't missing that many.  

*CODE THE REST AS EITHER POSSIBLE OR NOT LATER
local altvars altlagyear altlagweight altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont
local flags altlagsysmis1 altlagsysmis2 altlagsysmis3
summarize `altvars' `flags' if pos1==0&pos2==0&pos3==0
*all sysmis, good
summarize `altvars'
*The value variables all have equal numbers of cases.  In this run altlagyear and altlagweight had two extra cases; the list below shows any such rows.  
list sid sen year if altlagyear!=.&altlagdper==.
*nd sen 2004 and nd hs 2018.  They are probably bcs of mirved house seats, as a result of etype=gs elections.

*GO BACK AND DEAL WITH THESE TWO ND CASES LATER

*make sure there is a 1-to-1 correspondence re being obs or not.  
summarize `altvars' if altlagpastuncont!=.
*yes, there is.  
summarize `altvars' `flags' if altlagpastuncont!=.
*The sysmis flags are sometimes observed alongside values.  That's because one type sometimes couldn't be used when another type could be.  Flags 1 and 2 are cleared on those rows; #3 didn't have any.  
foreach v in altlagsysmis1 altlagsysmis2 {
    replace `v'=. if altlagpastuncont!=.
}

*altlagpossible: 1 if an alternative lag was observed or flagged unusable, 0 if pos1, pos2 and pos3 all rule it out, missing otherwise.  
generate altlagpossible=1 if altlagyear!=.|altlagsysmis1==1|altlagsysmis2==1|altlagsysmis3==1
replace altlagpossible=0 if altlagpossible==.&pos1==0&pos2==0&pos3==0

*If a state always has nested or identical districts in the time period in question, then altlagpossible should=1.
generate tempnest=nest!=.
egen min=min(tempnest), by(sid)
tabulate min
*about 19% are min=1.
tabulate sid min
*I think the following states should be included, but aren't: ia md nd or sd wy
*Perhaps in the very early years they didn't have identical or nested districts.  
tabulate sid if nest==.
*except for nd, all of those states have very few sysmis for nest.  
foreach num in 15 20 34 37 41 50 {
    display "`num'"
    tabulate year nest if sid==`num', missing
}
*ia is 1968 only
*md is 1968 only
*nd is up to 1990
*or is 1968 and 1970 only
*sd is 1968 and 1970 only
*wy is 1968 to 1980 only
*Before implementing the changes to altlagpossible, see if it is consistent with min.
tabulate altlagpossible if min==1
*There are a lot of 0 values, so there are errors in the code above.  

*PATCH
*For now, just do the following: undecided 2018 rows that are up in 2018 are treated as impossible, and system missing altlag values are zero filled where altlagpossible is 0 or the row is a 2018 election up in 2018.  
replace altlagpossible=0 if year==2018&nextup==2018&altlagpossible==.
foreach v in altlagweight altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont {
    replace `v'=0 if `v'==.&(altlagpossible==0|(year==2018&nextup==2018))
}

*altobs summarises what kind of alternative lagged information a row carries: a vote share, only party-held indicators, or nothing.  
generate altobs=cond(altlagdper!=.&altlagdper!=0, "voteobs", cond((altlagcand!=0&altlagcand!=.)|(altlaginc!=0&altlaginc!=.)|(altlagother!=0&altlagother!=.)|(altlagpast!=0&altlagpast!=.), "partyheldobs", "none"))

save $mainfile, replace



*CHECK
*2018 MISSINGNESS
*Evaluate whether lagged values appear in all the 2018 cases they should.  
use $mainfile, clear
*Every 2018 case with a missing lagged value must be one that is next up in 2020.  
assert nextup==2020 if year==2018&inclag==.
*Yes, great
*Equivalently, no 2018 case that is up in 2018 may lack inclag.  
summarize dontuselag if year==2018&nextup==2018&inclag==.
assert r(N)==0
*no obs, good
clear




*MAINFILE
use $mainfile, clear

*2018MISSING
*Model variables must be observed for 2018 cases that are up in 2018.  Three districts are exempt from this check: sid 39 house dno 36, and sid 45 house dname addisonrutland and windhambenningtonwindsor.  
local exempt (sid==39&sen==0&dno==36)|(sid==45&sen==0&(dname=="addisonrutland"|dname=="windhambenningtonwindsor"))
foreach v in dperlag candlag inc inc2 inc3 inclag inc2lag inc3lag other otherlag past pastlag wave wavelag dheld rheld oheld eseats {
    display "`v'"
    assert `v'!=. if year==2018&nextup==2018&!(`exempt')
}
*none are missing, good.
*For 2018 cases that are next up in 2020, these variables should still be observed.  
foreach v in dheld rheld oheld eseats {
    display "`v'"
    assert `v'!=. if year==2018&nextup==2020
}
*none are missing, good.
*Variables that should be missing for 2018 cases next up in 2020 are made system missing.  Some were non-missing because of stata's error with summing in collapse.  wavelag doesn't necessarily have to be sysmis, but most are, so for consistency it is made sysmis under those conditions.  
foreach v in inc other past wave wavelag {
    replace `v'=. if year==2018&nextup==2020
}
*The following variables don't necessarily have to be missing, but they all are.  Confirm that for consistency's sake.  
foreach v in dperlag candlag inclag otherlag pastlag {
    display "`v'"
    assert `v'==. if year==2018&nextup==2020
}
*They are all missing

*INC*PERIOD
*Arbitrarily divide the years studied into four periods.  
*26 bienniums examined, including 2018, so the periods get 7, 7, 6 and 6 elections: 1968-1980, 1982-1994, 1996-2006 and 2008-2018.  
*Given that there are year level error terms, are the dummy variables representing periods necessary?  And the lagged ones are a joke, and their intersection with the non-lagged ones has potentially very damaging consequences.  
*year, lagyearmin and lagyearmax are given one-letter names while the dummies are built, so the dummy names can carry the letter as a suffix.  
rename (year lagyearmin lagyearmax) (a b c)
foreach let in a b c {
    local k=0
    foreach span in 1968/1980 1982/1994 1996/2006 2008/2018 {
        local ++k
        *years inside the span become 1; every other year (anything from 1111 up) becomes 0
        recode `let' (`span'=1), generate(period`k'`let')
        recode period`k'`let' (1111/max=0)
    }
}
rename (a b c) (year lagyearmin lagyearmax)
forvalues i=1/4 {
    rename period`i'a period`i'
    *the lagged period dummy is the mean of the dummies for the earliest and latest lagged year
    generate period`i'lag=(period`i'b+period`i'c)/2
    drop period`i'b period`i'c
    generate incperiod`i'=inc*period`i'
    generate incperiod`i'lag=inclag*period`i'lag
}

*UNCONTLAG-INTERACTIONS
*Interactions between uncontlag and each of inclag, otherlag and pastlag.  
*The impact of past incumbency on change in vote share will be different depending on whether the last election was contested or not.  
foreach stem in inc other past {
    generate `stem'uncontlag=`stem'lag*uncontlag
}
*Are there cases of an incumbent of one party being the only cand of their party running and being opposed by two cands of the other party?
tabulate inclag candlag if eseats==2
*there are 27 cases like that.  Unfortunately that isn't 0, but it's not enough to justify modeling it.  I don't want to weigh the model down with tons of parameters, and they will already be controlled for additively for both Xs (partially contested / incumbency).  
foreach stem in inc other past {
    tabulate `stem'uncontlag
}
*Those look good.  

*The interactions must always be observed for 2018 cases that are up in 2018.  
foreach v in incuncontlag otheruncontlag pastuncontlag {
    display "`v'"
    assert `v'!=. if year==2018&nextup==2018
}

save $mainfile, replace








clear
use $mainfile
*HOLDOVERS & NONMAJ INCS
*Hand-coded adjustments for individual 2018 seats whose holder does not caucus along simple party lines.  
*For the following code, I utilized the following sources of information.
*Vanessa's spreadsheet that tracked people of one party who caucus with another party, and how non-dem/non-repubs caucus.  
*SLERs nonmaj cands who won in 2014 to a four year term or 2016 for a two year term.  I then looked to see if they were running or not in 2018.  If they were, I made a determination of which party they would caucus with.
*A list of party switchers who won, which I generated from SLERs & my 2018 cand list.  
*The work from this is in the files 007_109partyweird20180807_20180809altbyklarner and 007_nonmajpartiesfromvanessa20180809altbyklarner.  The code below is implied by what is written in the columns on the far right.  
*One problem is that some of the nonmaj party incs are in ffammds (but never more than one).  When this is the case, eseats is reduced by 1.  eseats2 captures this.  For this run, three of these cands are in such elections (2 or more seats).  
*eseats2 starts out as a copy of eseats.  
gen eseats2=eseats
*Paul Seaton, ak hs 31: was a repub, caucused with dems, now running as an indep.  Opposed by a repub, but not a dem.  I'm giving him a 50-50 chance of winning the general.  
*temp marks the single row for that seat (sid 2, house, dno 31, 2018).  dem50 and weirdcaucus are 1 on that row and 0 elsewhere; dheld2 and rheld2 are 0 on that row and missing elsewhere.  
gen temp=sid==2&sen==0&dno==31&year==2018
gen dem50=temp
gen dheld2=0 if temp
gen rheld2=0 if temp
gen weirdcaucus=temp
*Louise B. Stutes: ak hs 32: was a repub, caucused with  dems, still running as repub.  running as a republican again, I think she'll caucus with the dems again, but she might lose her primary against the other repub running in the repub primary.  I give her a 50-50 chance of winning the primary and a 100% chance of caucusing with the dems if she wins.
*OCT 29, 2018: She won her primary, but is opposed by a non-partisan also in the general, and I think that person is a republican in sheep's clothing.  I'm going to deal with this new development by simply assuming it will exactly offset the old development, which is reasonable and also has the added benefit of me not having to alter the code.  
replace temp=sid==2&sen==0&dno==32&year==2018
replace dem50=2 if temp
replace dheld2=0 if temp
replace rheld2=0 if temp
replace weirdcaucus=1 if temp
*mcelroy, mark: ar hs 11: was a dem, became an indep, is running in 2018.  I couldn't find info on who he'll caucus with, so I'm just calling him a dem.
replace temp=year==2018&sid==4&sen==0&dno==11
replace dheld2=1 if temp
replace rheld2=0 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
gen exception=temp&eseats>1&eseats!=.
*Kent Ackley: me hs #82: independent that caucuses with dems
replace temp=year==2018&sid==19&sen==0&dno==82
replace dheld2=1 if temp
replace rheld2=0 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*Owen Casas: me hs #94: independent that caucuses with dems
replace temp=year==2018&sid==19&sen==0&dno==94
replace dheld2=1 if temp
replace rheld2=0 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*higgins, norman: me hs #120: was a repub, now running as an indep
*https://bangordailynews.com/2017/10/17/politics/maine-lawmaker-leaves-republican-party/ doesn't say how he will caucus.  Other articles didn't say who he would caucus with.  He went with the dems on the budget, he was the only republican to vote on their side, so I'm going to say he'll caucus with the dems.
replace temp=year==2018&sid==19&sen==0&dno==120
replace dheld2=1 if temp
replace rheld2=0 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*Caleb Q. Dyer: nh hs hills37, libertarian caucuses with repubs
replace temp=year==2018&sid==29&sen==0&dname=="hillsborough"&dno==37
replace dheld2=0 if temp
replace rheld2=1 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*Brandon Phinney: nh hs straff24, libertarian caucuses with repubs
replace temp=year==2018&sid==29&sen==0&dname=="strafford"&dno==24
replace dheld2=0 if temp
replace rheld2=1 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*RI
*I'm not sure if this has been filled in yet.  HS #36 had an independent, but he switched to being a republican, and is unopposed in the 2018 general election.  
replace temp=year==2018&sid==39&sen==0&dno==36
replace dheld2=0 if temp
replace rheld2=1 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*VT
*Kelly Pajala: running unopposed in windhambenningtonwindsor.  She is an indepenent, but was appointed by a republican governor, and is pro-school choice.  http://www.chestertelegraph.org/2017/10/16/rep-olsen-announces-resignation-from-state-house-town-clerk-pajala-throws-hat-in-ring/.  I'm coding her as a Republican, I couldn't find any statements about which party she was going to caucus with. 
replace temp=year==2018&sid==45&sen==0&dname=="windhambenningtonwindsor"
replace dheld2=0 if temp
replace rheld2=1 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
*All the following are in the VT HS
*Terry Norris addison-rutland: independent incumbent caucuses with dems
*Barbara Murphy franklin 2: independent incumbent caucuses with dems
*Ben Jickling orange-washington-addison: independent incumbent caucuses with dems
*Paul Poirier washington 3: independent incumbent caucuses with dems
*Laura Sibilia windham-bennington: independent incumbent caucuses with dems
replace temp=year==2018&sid==45&sen==0&(dname=="addisonrutland"|(dname=="franklin"&dno==2)|dname=="orangewashingtonaddison"|(dname=="washington"&dno==3)|dname=="windhambennington")
replace dheld2=1 if temp
replace rheld2=0 if temp
replace eseats2=eseats2-1 if temp
replace weirdcaucus=1 if temp
replace exception=1 if temp&eseats>1&eseats!=.
drop temp

*HOLDOVER
*Seats not up until 2020 stay with the party recorded in dheld/rheld, unless a value was already assigned by hand.  
foreach p in d r {
    replace `p'held2=`p'held if `p'held2==.&nextup==2020&year==2018
}

*FLIPPEDHOLDOVERS
*Some seats up next in the general elections of 2020 have had special elections that flipped party.  These have to be taken into account for the simulations of chamber control.  
*Senate seat that went from d to r.  
local seat year==2018&sen==1&sid==5&dno==29
replace dheld2=0 if `seat'
replace rheld2=1 if `seat'
*Senate seats credited to the dems: the first two (sid 25 and sid 49) went from r to d, the third (sid 36) was the result of a 2017 special election.  Each pair is state number then district number.  
foreach pair in "25 17" "49 10" "36 37" {
    local st : word 1 of `pair'
    local ds : word 2 of `pair'
    replace dheld2=1 if sid==`st'&sen==1&dno==`ds'&year==2018
    replace rheld2=0 if sid==`st'&sen==1&dno==`ds'&year==2018
}

*UNCONT
*Party that will win seats that are uncontested this time.  
*Guard: among 2018 cases up in 2018 that still lack a held value, none may be missing uncont, dcand, rcand or eseats2.  
sum sid if (uncont==.|dcand==.|rcand==.|eseats2==.)&(dheld2==.|rheld2==.)&year==2018&nextup==2018
assert r(N)==0
foreach p in d r {
    replace `p'held2=(`p'cand*eseats2) if `p'held2==.&uncont==1&year==2018&nextup==2018
}

*CHECK
*dheld2 and rheld2 must be observed on exactly the same rows.  
assert dheld2!=. if rheld2!=.
assert rheld2!=. if dheld2!=.
*same number of cases as each other, good
*Every 2018 row that still lacks dheld2 must have all of the listed predictors observed.  
sum year if ((dperlag==.)|(candlag==.)|(inc==.)|(inclag==.)|(other==.)|(otherlag==.)|(past==.)|(pastlag==.)|(wave==.)|(wavelag==.))&dheld2==.&year==2018
assert r(N)==0
*none missing when dheld2 is missing, good.  

*CROSSPARTY
*ny 2018 assembly #142 ran as repub and won, but caucuses with the dems.  That is erik bohen.  He is running against a dem in the gen and a repub in the primary.  He might lose the primary, so I'm giving a repub win there a 50% chance of being a dem seat when it comes to caucusing.  "irc" is a made up term meaning "independent republican caucus" to make it easy to remember it is the opposite of the idc (Independent Democratic Conference).  
*OCT 29, 2018 NOTE: Erik Bohen won the Republican primary.  
*irc is a 0/1 flag for that single seat.  
gen irc=sid==32&sen==0&dno==142&year==2018
*Get more of the cross-party etc caucusers from above, for this variable.  
replace weirdcaucus=1 if irc==1
*Two more New York seats (senate 17 and assembly 62) are flagged; the reason is not stated in this part of the file.  
replace weirdcaucus=1 if year==2018&sid==32&((sen==1&dno==17)|(sen==0&dno==62))

*NEEDEST
*Create a variable that indicates which contests need an estimate from the model.  Note that three of these cases (all ffammds) will be a combination of an assumption about a third party candidate winning, and an estimate from the model for the other seats in the district.  
*needest is 1 when no held value has been assigned or the row is an exception, 0 otherwise, and missing outside 2018.  
gen needest=(dheld2==.|exception==1) if year==2018
list sid sen dname dno geopost mmdpost exception needest if weirdcaucus==1
*That all looks good.  
assert dheld2!=.&rheld2!=. if year==2018&needest==0

*DHELD2/RHELD2
*Fill in the system missing values with 0.  This is so these variables can be added to the estimates.  
recode dheld2 rheld2 (.=0) if year==2018

*yearsid identifies a state-year; it is used as a grouping level in the mixed models further down.  
gen yearsid=(year*100)+sid

*Y
*Blank out the dependent variable for 2018.  
replace dper=. if year==2018

*NC 2018
*For each variable a copy with suffix 2 is made first, then the ORIGINAL variable is set to 0 (missing values included) for the NC (sid 33) 2018 rows with redist==1.  
*NOTE(review): the models and checks further down use the suffix-2 copies (candlag2, incuncontlag2, otheruncontlag2, pastuncontlag2), which this recode leaves untouched.  If the intent was to zero these lags in the model for the redistricted NC seats, the recode target should be the copy rather than the original.  Confirm the intent before changing anything.  
foreach string in candlag incuncontlag otheruncontlag pastuncontlag {
gen `string'2=`string'
recode `string' (*=0) if sid==33&year==2018&redist==1
}

*MODELVARS
*Every right-hand-side variable must be observed in 2018 wherever an estimate is needed (needest=1).  
local varlist dperlag cand candlag2 incuncontlag2 incperiod1 incperiod2 incperiod3 incperiod4 incperiod1lag incperiod2lag incperiod3lag incperiod4lag inc2 inc3 other otherlag otheruncontlag2 past pastlag pastuncontlag2 switch switchwin stealth stealthwin sen wave wavelag smidpen smidpenlag rcontest rcontestb rcontestdperlag rcontestinclag rwinner rwinnerb rwinnerdperlag rwinnerinclag rloser rloserb rloserdperlag rloserinclag altlagpossible altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont
foreach v of local varlist {
    di "`v'"
    *Count the offending rows and stop if there are any.  
    sum sid if `v'==.&year==2018&needest==1
    assert r(N)==0
    *Author's open question, left for later: a direct assert that the variable equals missing under the same condition was tried here and ran without error, which it should not have given that there are no missing values.  The line stays disabled.  
    *assert `v'==. if needest==1&year==2018
}
*None of those are missing under those conditions.  

*dheld2 and rheld2 must always be observed when needest==0 in 2018.  
sum sid if (dheld2==.|rheld2==.)&needest==0&year==2018
assert r(N)==0
*eseats2 must always be observed in 2018.  
sum sid if eseats2==.&year==2018
assert r(N)==0

save $mainfile, replace





*DROP-1 ANALYSES
*For each election year from 1968 to 2016: blank out that year's outcome, impute, refit the mixed model and store the predictions in pre followed by the year.  
use $mainfile, clear

*DROP
*Only contested, usable, non-bigthird elections before 2018 enter the analysis.  
drop if uncont==1|dontuse==1|bigthird!=0|year==2018
gen dper2=dper
*The lagged vote share is only trusted when the lagged election was usable and had no big third party.  
gen dperlag2=dperlag if dontuselag==0&bigthirdlag==0
*xvars is the fixed part of the model; varlist adds the outcome and the two grouping variables.  
local xvars dperlag2 cand candlag2 incuncontlag2 incperiod1 incperiod2 incperiod3 incperiod4 incperiod1lag incperiod2lag incperiod3lag incperiod4lag inc2 inc3 other otherlag otheruncontlag2 past pastlag pastuncontlag2 switch switchwin stealth stealthwin sen wave wavelag smidpen smidpenlag rcontest rcontestb rcontestdperlag rcontestinclag rwinner rwinnerb rwinnerdperlag rwinnerinclag rloser rloserb rloserdperlag rloserinclag altlagpossible altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont
local varlist dper2 `xvars' year yearsid
keep `varlist' sid dname dno geopost mmdpost contest altobs
xtset, clear
save temp, replace

forvalues yr=1968(2)2016 {
    use temp, clear
    save $drop1file`yr', replace
    *Hide the outcome of the year being dropped.  
    replace dper2=. if year==`yr'
    *Split varlist into the variables that have missing values (to be imputed) and the fully observed ones (used as predictors in the imputation).  
    local missvars ""
    local obsvars ""
    foreach v of local varlist {
        sum sid if `v'==.
        if r(N)==0 {
            local obsvars `obsvars' `v'
        }
        else {
            local missvars `missvars' `v'
        }
    }
    mi set mlong
    mi register imputed `missvars'
    mi impute mvn `missvars' = `obsvars', add(5) rseed(244922)
    *Blank the dropped year again so that its imputed outcomes do not enter the estimation.  
    replace dper2=. if year==`yr'
    mi estimate, saving($drop1file`yr', replace): xtmixed dper2 `xvars' || year: || yearsid:
    mi predict pre`yr' using $drop1file`yr'
    save $drop1file`yr', replace
}


*I then moved the files "by hand" to subfolder 001drop1files20181030.  





*MERGE
*Stack the drop-1 predicted values side by side: one file per dropped year is read from the subfolder, reduced to the original data (_mi_m==0) and merged onto the running file in the run folder.  
local rundir E:\Dropbox\08_FORECAST\005runs\run20181030
local subdir E:\Dropbox\08_FORECAST\005runs\run20181030\001drop1files20181030
local keys year sid sen dname dno geopost mmdpost
forvalues yr=1968(2)2016 {
    clear
    cd `subdir'
    use 112drop1file20181030`yr'
    keep if _mi_m==0
    keep `keys' pre`yr'
    cd `rundir'
    *The 1968 file starts the running file; every later year is merged onto it.  
    if `yr'>1968 {
        merge 1:1 `keys' using 112drop1file20181030
        drop if _merge==2
        assert _merge==3
        drop _merge
    }
    save 112drop1file20181030, replace
}
erase temp.dta

*ASSESS
*See if the estimates for a year that's been dropped in the drop1 analysis are more different from the other years than the other years are with each other.  
*Each triplet looks at the cases of one election year.  The outcome is the prediction from the run that dropped the following election; the first regressor is the prediction from the run that dropped the cases' own year, and the other two come from runs that dropped later years.  
reg pre1970 pre1968 if year==1968
reg pre1970 pre1972 if year==1968
reg pre1970 pre1974 if year==1968
*corroborates expectation
reg pre1994 pre1992 if year==1992
reg pre1994 pre1996 if year==1992
reg pre1994 pre1998 if year==1992
*corroborates expectation
*scatter pre1994 pre1992 if year==1992
*pretty ugly
*scatter pre1996 pre1998 if year==1998
*Much better, since almost no redistricting.  
*The middle regression of this triplet used to regress pre1998 on itself, which is uninformative.  It now follows the pattern of the two triplets above: own year, then the runs dropping the years four and six years later.  
reg pre1998 pre1996 if year==1996
reg pre1998 pre2000 if year==1996
reg pre1998 pre2002 if year==1996
*all three pairs are highly correlated when there isn't redistricting, as expected.  

*COMPILE
*pre1drop takes, for the cases of each year, the prediction from the run in which that same year was dropped.  The year-specific columns are discarded as they are consumed.  
gen pre1drop=.
foreach yr of numlist 1968(2)2016 {
    replace pre1drop=pre`yr' if year==`yr'
    drop pre`yr'
}
save $drop1file$datezzz, replace

*MAINFILE
*In mainfile, compute contest2, and then bring contest2 and altobs into the drop1file.  Also bring dper.  
use $mainfile, clear

*CONTEST2
*contest2 is a coarser version of contest: party-specific and numbered variants are folded into one category each.  
gen contest2=contest
replace contest2="bigthirdlag" if inlist(contest,"bigthirdlag1","bigthirdlag2")
replace contest2="newcands" if inlist(contest,"newcandsdwinner","newcandsrwinner")
replace contest2="repeatcontest" if inlist(contest,"repeatcontestdwin","repeatcontestrwin")
replace contest2="repeatloser" if inlist(contest,"repeatdloser","repeatrloser")
replace contest2="repeatwinner" if inlist(contest,"repeatdwinner","repeatrwinner")
*The next three rules test contest2 itself rather than contest.  
replace contest2="unclassified" if inlist(contest2,"unclassifiedforthemoment","speciallagged")
replace contest2="uncontlag" if inlist(contest2,"uncontbothtimes","uncontlasttime")
replace contest2="puncontlag" if contest2=="partuncontlasttime"
replace contest2="notused" if inlist(contest,"bigthird1","bigthird2","dontuse","uncontthistime")
replace contest2="redist" if contest=="altholdoverredist"
tab contest2
save $mainfile, replace



*Attach contest2, altobs and dper from the main file (pre-2018 rows only) to the compiled drop-1 predictions.  
use $mainfile, clear
drop if year==2018
local keys year sid sen dname dno geopost mmdpost
keep contest2 altobs dper `keys'
merge 1:1 `keys' using $drop1file$datezzz
tab contest2 _merge
*That looks good.  
*Author's notes from this run: the only red flag is that contest2=redist&_merge=1 for a lot of cases (8842 of them), hopefully because they are uncontested.  
sum dper if contest2=="redist"&_merge==1&(dper==0|dper==100)
*uncontested elections account for all but 17, good.  
*DEAL WITH THE ABOVE IN CONTEST SOMETIME BUT IT WON'T HURT ANYTHING NOW.  THERE IS SOME OTHER WEIRDNESS IN THE CROSS-TAB ABOVE, BUT I'LL DEAL WITH IT LATER AND I THINK IT'S SIMILAR TO THE IMMEDIATELY ABOVE ISSUE.  
*Rows found only in the main file have no drop-1 prediction and are discarded.  
drop if _merge==1
drop _merge
save $drop1file$datezzz, replace









use $drop1file$datezzz, clear

*CONSOLIDATE
*If there aren't many cases in a cat of contest2-altobs, collapse them with unclassified.
gen contest3=contest2
replace contest3="sysmislag" if inlist(contest2,"bigthirdlag","dontuselag","firstcase","redist","uncontlag")

*altobs becomes a 0/1 indicator for voteobs.  partyheldobs is folded in with "none" because there are so few cases of it.  
gen altobs1=altobs=="voteobs"
drop altobs
rename altobs1 altobs
*One dummy per contest3 category, plus its product with altobs (suffix 2).  
levelsof contest3, clean local(cats)
foreach cat of local cats {
    gen `cat'=contest3=="`cat'"
    gen `cat'2=`cat'*altobs
}

*Peel the drop-1 residual into a year part, a state-year part and what is left over.  
gen r1=dper-pre1drop
egen yearmean=mean(r1), by(year)
gen r2=r1-yearmean
egen statemean=mean(r2), by(year sid)
gen r3=r2-statemean
sum yearmean statemean r3
*SDs look a lot like the three levels of error from the hlm.  

*MAE
forvalues k=1/3 {
    gen ae`k'=abs(r`k')
}

*Run through each variable derived from contest3 in turn and make it the default category.
local xvars ffammd repeatcontest puncontlag repeatwinner repeatloser newcands sysmislag
foreach x of local xvars {
    di "OMITTED CATEGORY BELOW IS `x'"
    local allbut: list xvars - x
    reg ae3 altobs `allbut' unclassified
    di "OMITTED CATEGORY ABOVE IS `x'"
    di ""
}

*Same rotation of the default category, but with the altobs interactions always included in full.  The difference between the default cat and those interactions is what is being assessed.  
local inter ffammd2 newcands2 puncontlag2 repeatcontest2 repeatloser2 repeatwinner2 sysmislag2
local xvars ffammd repeatcontest puncontlag repeatwinner repeatloser newcands sysmislag
foreach x of local xvars {
    di "OMITTED CATEGORY BELOW IS `x'"
    local allbut: list xvars - x
    reg ae3 altobs `allbut' unclassified `inter'
    di "OMITTED CATEGORY ABOVE IS `x'"
    di ""
}
*when comparing them pair-wise, none of them are SSG.  

*no omitted cat in following, I allow stata to throw one out as a check, which it did, good.  
reg ae3 altobs `xvars' unclassified
*The residual is the absolute district error with the contest-type and altobs effects removed.  
predict ae3_sans_dist_factors, residuals

*Number the contest3-by-altobs groups: the first row of each group gets a 1 and the running sum turns that into a group id.  
bysort contest3 altobs: gen id=1 if _n==1

replace id=sum(id)
bysort id: gen cases=_N
tab id cases
*there are of course the groups with n=8 and 10, but they don't matter.  The next smallest is 144, and the next is 288.  No problem.
*Relationship between the SD of the district residual and its mean absolute value across groups; the coefficients are reused further down to turn an expected absolute error into an SD.  
egen sd=sd(r3), by(id)
egen mean=mean(ae3), by(id)
reg sd mean
*LAST TIME IT WAS: sd=.1476896 + (1.265293*mean), r2=.9993
*THIS TIME IT IS: sd=.1629688 + (1.261702*mean), r2=.9994
drop sd mean

save 113erroranalysis$datezzz, replace

*UOA=YEAR
*Collapse to one row per year, fit polynomial trends of degree 1 to 4 to the adjusted error, and draw the three figures.  
collapse (mean) ae3_sans_dist_factors ae3 (sd) sdae3_sans_dist_factors=ae3_sans_dist_factors , by(year)
quietly lowess ae3_sans_dist_factors year, gen(lowessest)
gen time=year-1968
gen sq=time*time
gen cube=sq*time
gen quar=cube*time
*est1 to est4 are the fitted values from the linear, quadratic, cubic and quartic trends.  
local rhs ""
local deg 0
foreach term in time sq cube quar {
    local ++deg
    local rhs `rhs' `term'
    reg ae3_sans_dist_factors `rhs'
    predict est`deg'
}
save temp, replace

*Options shared by the first plot of every figure.  
local frame xlabel(1968(8)2016) xtitle("") xsize(1) ysize(1.2) graphregion(color(stone))

twoway ///
(lowess ae3 year, lcolor(sienna) lwidth(thick) ylabel(0(2)10) `frame' ) ///
(qfit ae3 year, lcolor(eltblue) lwidth(thick)) ///
(scatter ae3 year, mcolor(black) msymbol(o) msize(medium)) ///
, title("Figure 1: Mean Absolute Value of" "District Level Error by Year") ///
legend(label (1 "Lowess Trend") label (2 "Quadratic Trend") label (3 "Mean Abs Dist Error")) ///
ytitle("")
graph export 114figure1$datezzz.png, width(13333) height(16000) replace

twoway ///
(lowess ae3_sans_dist_factors year, lcolor(sienna) lwidth(thick) ylabel(-1(.2)1) `frame' ) ///
(qfit ae3_sans_dist_factors year, lcolor(eltblue) lwidth(thick)) ///
(scatter ae3_sans_dist_factors year, mcolor(black) msymbol(o) msize(medium)) ///
, title("Figure 2: Mean Absolute Value of" "District Level Error Sans" "District Level Sources of Error by Year") ///
legend(label (1 "Lowess Trend") label (2 "Quadratic Trend") label (3 "Mean Error Sans District Factors")) ///
ytitle("")
graph export 115figure2$datezzz.png, width(13333) height(16000) replace

twoway ///
(scatter est3 year, mcolor(sienna) msymbol(o) msize(medium) ylabel(-1(.2)1) `frame' ) ///
(scatter est4 year, mcolor(eltblue) msymbol(o) msize(medium)) ///
(scatter ae3_sans_dist_factors year, mcolor(black) msymbol(o) msize(medium)) ///
, title("Figure 3: Mean Absolute Value of" "District Level Error Sans" "District Level Sources of Error by Year") ///
legend(label (1 "Cubic Trend") label (2 "Quartic Trend") label (3 "Mean Error Sans District Factors")) ///
ytitle("")
graph export 116figure3$datezzz.png, width(13333) height(16000) replace
*The cubic relationship looks very slightly better than the quartic relationship.  I'm going to use that.  
clear






*YEAR ERROR
*SD across years of the yearly mean of r1; saved as a one-row file keyed on the constant c for the simulation.  
use 113erroranalysis$datezzz, clear
collapse (mean) r1, by(year)
gen c=1
collapse (sd) yearerrorsd=r1, by(c)
save tempyearerror, replace

*STATE-YEAR ERROR
*SD across state-years of the state-year mean of r2; same one-row layout.  
use 113erroranalysis$datezzz, clear
collapse (mean) r2, by(year sid)
gen c=1
collapse (sd) stateyearerrorsd=r2, by(c)
save tempstateyearerror, replace




*ERROR BY CONTEST & YEAR
*Estimate amount of expected district level error by type of contest.  

*ERRORANALYSISFILE
*Pull the absolute district error and the drop-1 prediction out of the error analysis file.  
local keys year sid sen dname dno geopost mmdpost
use 113erroranalysis$datezzz, clear
keep `keys' ae3 pre1drop
save temp, replace

*MAINFILE + MERGE
*Attach them to the main file; every row of the error file must find its match there.  
use $mainfile, clear
merge 1:1 `keys' using temp
tab contest2 _merge
*looks good.  
assert _merge!=2
drop _merge
erase temp.dta

save 117mainfile$datezzz, replace




use 117mainfile$datezzz, clear

*CONTEST3
*contest3 was not computed in this file, so it is rebuilt here.  
*Since altobs is only being used additively, it is broken into two dummy vars instead of one.  
*CONSOLIDATE
*If there aren't many cases in a cat of contest2-altobs, collapse them with unclassified.
*PROBLEM (author's note): in the error analysis file uncontested-last-time was lumped into the same cat as redistricting, and there is a huge difference.  Here uncontlag is kept as its own category, on the assumption that the two can be separated with no problem.  
gen contest3=contest2
replace contest3="sysmislag" if inlist(contest2,"bigthirdlag","dontuselag","firstcase","redist")
levelsof contest3, clean local(cats)
foreach cat of local cats {
    gen `cat'zzz=contest3=="`cat'"
}
gen altobs1=altobs=="voteobs"
gen altobs2=altobs=="partyheldobs"

*REGRESSION
*Regression estimates expected absolute value of district level error by category and by year.  
*time counts years since 1968; 2018 is given the 2016 value of 48.  
gen time=cond(year==2018,48,year-1968)
gen sq=time*time
gen cube=sq*time
reg ae3 altobs1 altobs2 ffammdzzz puncontlagzzz repeatwinnerzzz repeatloserzzz newcandszzz uncontlagzzz sysmislagzzz unclassifiedzzz time sq cube
predict expectederror
sum sid expectederror if year==2018&needest==1
*completely obs, good
tab contest3 if year==2018&needest==1
*translate MAE into SD
*Last time: sd=.1476896 + (1.265293*mean), r2=.9993
*I thought the above would be general and wouldn't have to be altered for different models.  The coefficients are a little different this time (oct 30, 2018), though, perhaps because of rounding error.  
*THIS TIME IT IS: sd=.1629688 + (1.261702*mean), r2=.9994
gen disterrorsd=.1629688+(1.261702*expectederror)
tab contest3 disterrorsd if year==2018&needest==1
*looks reasonable.  
*cleanup
drop ffammdzzz newcandszzz notusedzzz puncontlagzzz repeatcontestzzz repeatloserzzz repeatwinnerzzz sysmislagzzz unclassifiedzzz uncontlagzzz altobs1 altobs2 time sq cube expectederror

*Bring in year and state-year error SDs; both files hold a single row keyed on c=1.  
gen c=1
foreach f in tempyearerror tempstateyearerror {
    merge m:1 c using `f'
    drop _merge
}
drop c

save 117mainfile$datezzz, replace





*2018 FORECAST
*There is some weird formatting left over from the prior multiple imputation.  "mi extract" should have been used before, but now that _mi_m has been removed it is unclear how to do this appropriately.  The workaround is to write the file out as csv and read it straight back in.  The only thing that was brought into the main dataset were the predicted values, so no imputed values are being brought into the main dataset.  
local csv temp117mainfile$datezzz.csv
use 117mainfile$datezzz, clear
export delimited `csv', replace
clear
import delimited `csv'
erase `csv'
save temp117mainfile$datezzz, replace


*DROP
*Final model: same specification as the drop-1 runs, now fitted with the 2018 cases in the file so that they receive predictions.  
use temp117mainfile$datezzz, clear
*Past elections that were uncontested, unusable or had a big third party are left out; 2018 rows are all kept.  
drop if (uncont==1|dontuse==1|bigthird!=0)&year<2018
gen dper2=dper
*For past years the lagged vote share is only trusted when the lagged election was usable and had no big third party; for 2018 it is always taken.  
gen dperlag2=dperlag if (dontuselag==0&bigthirdlag==0&year<2018)|year==2018

*xvars is the fixed part of the model; varlist adds the outcome and the two grouping variables.  
local xvars dperlag2 cand candlag2 incuncontlag2 incperiod1 incperiod2 incperiod3 incperiod4 incperiod1lag incperiod2lag incperiod3lag incperiod4lag inc2 inc3 other otherlag otheruncontlag2 past pastlag pastuncontlag2 switch switchwin stealth stealthwin sen wave wavelag smidpen smidpenlag rcontest rcontestb rcontestdperlag rcontestinclag rwinner rwinnerb rwinnerdperlag rwinnerinclag rloser rloserb rloserdperlag rloserinclag altlagpossible altlagdper altlagcand altlaginc altlagincuncont altlagother altlagotheruncont altlagpast altlagpastuncont
local varlist dper2 `xvars' year yearsid
keep `varlist' sid dname dno geopost mmdpost contest altobs contest3 disterrorsd yearerrorsd stateyearerrorsd

*Split varlist into the variables that have missing values (to be imputed) and the fully observed ones (used as predictors in the imputation).  
local missvars ""
local obsvars ""
foreach v of local varlist {
    sum sid if `v'==.
    if r(N)==0 {
        local obsvars `obsvars' `v'
    }
    else {
        local missvars `missvars' `v'
    }
}
di "`missvars'"
di "`obsvars'"
mi set mlong
mi register imputed `missvars'
mi impute mvn `missvars' = `obsvars', add(10) rseed(2495211)
*Imputed 2018 outcomes must not enter the estimation.  
replace dper2=. if year==2018
mi estimate, saving(118finalmodelests$datezzz, replace): xtmixed dper2 `xvars' || year: || yearsid:
mi predict prefinal using 118finalmodelests$datezzz
save 118finalmodelests$datezzz, replace


*MIEXTRACT
*Reduce the mi file to the original data (m=0) and keep only the identifiers and the final prediction, ready to be merged back into the main file.  
use 118finalmodelests$datezzz, clear
mi extract 0
keep year sid sen dname dno geopost mmdpost prefinal
save temp, replace


*Name of the main file that the votes-to-seats section creates.  
global mainfile2 119mainfile$datezzz


*FFA-MMD VOTES TO SEATS SIMULATION
*For FFA-MMDs, a given vote share sometimes translates into different percentages of seats for one party or the other.  
*Models are estimated for two seat FFA-MMDs only.  The states with FFAMMDs with three or more seats were judged not interesting politically (MD, NH, VT and WV), so FFAMMDs with more than two seats are not modeled.  (Oct 30, 2018 note: I see I was wrong to say that about NH and WV.)
*FFA-MMDs that are three seats or more are assigned in full to the party that sweeps the district.  
*Y=# of seats won by Dems
*X1=% of vote for the Dems
*X2=sweep: party receiving 50% or more of the vote.  
*X3=cand: dems as a proportion of seats minus repubs as a prop of seats
*Given the same vote share in an FFA-MMD, a party will receive more seats if they have FEWER candidates running.  Their votes are more concentrated.  
*X4=inc: dem incs as a proportion of seats minus repub incs as a proportion of seats.  Given the same vote share, the more incumbents a party has, the more seats they'll attain.  
*Start from the csv-round-tripped main file and attach the final predictions.  
use temp117mainfile$datezzz, clear

merge 1:1 year sid sen dname dno geopost mmdpost using temp
*merge looks good.
assert _merge!=2
drop _merge
erase temp.dta

*DPER2
gen dper2=dper if uncont==0&dontuse==0&bigthird==0&year<2018
gen dperlag2=dperlag if (dontuselag==0&bigthirdlag==0&year<2018)|year==2018

gen demseats=eseats2*dwin
recode dper2 (min/50=0) (50/max=1), gen(sweep)
*Four situations for contested two seat elections without a big third party: 1 = cand is -.5, 2 = cand is 0 with the dems below 50, 3 = cand is 0 with the dems above 50, 4 = cand is .5.  
local two eseats2==2&uncont==0&bigthird==0
gen situation=1 if `two'&cand==-.5
replace situation=2 if `two'&cand==0&dper2<50
replace situation=3 if `two'&cand==0&dper2!=.&dper2>50
replace situation=4 if `two'&cand==.5
bysort situation: tab demseats
*That looks great, only one outcome for each situation.  
tab demseats if situation==1&dper2!=.&dper2>((1/3)*100)
*always 1, good.  
tab demseats if situation==4&dper2<((2/3)*100)
*always 1, good.  
*Those two ranges have a fixed outcome, so they are taken out of the estimation samples.  
replace situation=. if situation==1&dper2!=.&dper2>((1/3)*100)
replace situation=. if situation==4&dper2<((2/3)*100)
*The problem for the simulation is that while situation 1 and situation 4 won't change (although they might change between having a determined outcome and a random outcome), situations 2 and 3 are interchangeable depending on simulated vote share.  
*temporarily fill in values for dper2 for 2018 so that the predicted value sans dper2 contribution can be computed.
replace dper2=50 if year==2018
*Per situation: ordered logit of seats on vote share and incumbency.  Stored are the linear prediction with the vote share term removed (opre), the vote share coefficient (ocoef) and the first cut point (thresh).  
forvalues s=1/4 {
    ologit demseats dper2 inc if situation==`s'
    predict opre`s', xb
    gen ocoef`s'=_b[dper2]
    replace opre`s'=opre`s'-(ocoef`s'*dper2)
    gen thresh`s'=_b[/cut1]
}
drop demseats situation

save $mainfile2, replace




*SIMULATION-UOADIST
*assuming that every seat is accounted for (each is) you only need to save dseats as they are tallied.  
*VOTESHARESIM
*Simulates the Democratic vote share 2000 times for every 2018 district-post that needs an estimate: the final model prediction plus a national, a state and a district error.  The simulated shares are stored in sd1 to sd2000 and summarized by row percentiles.  
clear
use $mainfile2
*drop states where the data is incomplete.
keep if year==2018
keep sid sen dname dno geopost mmdpost dheld2 rheld2 prefinal eseats eseats2 cand uncont ocoef1 opre1 thresh1 ocoef2 opre2 thresh2 ocoef3 opre3 thresh3 ocoef4 opre4 thresh4 irc dem50 needest exception weirdcaucus disterrorsd yearerrorsd stateyearerrorsd
forvalues aaa=1/2000 {
*The national (year level) error has to be ONE draw shared by every district within a simulation.  It used to be drawn separately for each row, which turned it into extra district level noise and removed the nationwide swing from the seat and chamber tallies built on these variables.  The draw is still generated for every row and then overwritten with the first row's value, so the random number stream is consumed exactly as before.  
gen naterr=rnormal()*yearerrorsd
replace naterr=naterr[1]
*The state error is shared within a state-chamber.  
*NOTE(review): the model's random effect and the statemean residual are at the state-year level (yearsid, by(year sid)), which would suggest sharing the draw across both chambers of a state (by sid only).  Left as is; confirm intent.  
gen stateerr=rnormal()*stateyearerrorsd
bysort sid sen: replace stateerr=stateerr[1]
*The district error is independent across rows.  
gen disterr=rnormal()*disterrorsd
gen sd`aaa'=prefinal+naterr+stateerr+disterr
*Seats whose outcome is assumed get no simulated vote share.  
replace sd`aaa'=. if needest==0
drop naterr stateerr disterr
}
*Keep simulated vote shares inside 0 to 100.  
recode sd1-sd2000 (min/0=0) (100/max=100)
egen dperp05=rowpctile(sd1-sd2000), p(5)
egen dperp33=rowpctile(sd1-sd2000), p(33)
egen dperp50=rowpctile(sd1-sd2000), p(50)
egen dperp67=rowpctile(sd1-sd2000), p(67)
egen dperp95=rowpctile(sd1-sd2000), p(95)
save 104simUOADistPer$datezzz, replace


*SEATSIM-UOADIST
*Simulation results in percent of times the Democrats will win a particular number of seats in a district-post.  
*Reload the simulated vote shares, discard the percentile summaries and move the identifying variables to the front.  
use 104simUOADistPer$datezzz, clear
drop dperp05 dperp33 dperp50 dperp67 dperp95
order sid sen dname dno geopost mmdpost eseats2 dheld2 
forvalues aaa=1/2000 {
*Certain vote shares and numbers of seats to be won deterministically translate into a number of seats in a district for a party.  Those are done first, before engaging the second simulation stage.  The translations for ffammds with three seats or more are done at the same time.  
recode sd`aaa' (min/50=0) (50/max=1) if eseats2!=2
replace sd`aaa'=sd`aaa'*eseats2 if eseats2!=2&needest==1
*Take repub in ny assembly who caucuses with the dems into account.  
local runi=runiform()
replace sd`aaa'=1 if `runi'<.5&irc==1
*AK HS 31
local runi=runiform()
replace sd`aaa'=1 if `runi'<.5&dem50==1
*AK HS 32
local runi=runiform()
replace sd`aaa'=1 if `runi'<.5&dem50==2
*FFAMMDs
*Translate votes into seats for two seat FFA-MMDs here
*first for situation=1 where the dem gets more than 33.33% of the vote.  
gen temp=eseats2==2&cand==-.5&sd`aaa'>((1/3)*100)&sd`aaa'!=.
replace sd`aaa'=1 if temp
replace temp=eseats2==2&cand==.5&sd`aaa'<((2/3)*100)
replace sd`aaa'=1 if temp
*assign the remaining cases to which situation it is (1 through 4)
gen tempsit=eseats2==2&cand==-.5&sd`aaa'<((1/3)*100)
replace tempsit=2 if eseats2==2&uncont==0&cand==0&sd`aaa'<50
replace tempsit=3 if eseats2==2&uncont==0&cand==0&sd`aaa'>50&sd`aaa'!=.
replace tempsit=4 if eseats2==2&cand==.5&sd`aaa'>((2/3)*100)&sd`aaa'!=.
gen tempprob=0
forvalues ccc=1/4 {
replace tempprob=1/(1+exp(opre`ccc'+(ocoef`ccc'*sd`aaa')-thresh`ccc')) if tempsit==`ccc'
}
sum tempprob
gen runi=runiform()
replace sd`aaa'=0 if tempsit!=0&tempsit!=.&(tempprob>runi)&tempprob!=.
replace sd`aaa'=1 if tempsit!=0&tempsit!=.&(tempprob<runi)
replace sd`aaa'=sd`aaa'+1 if (tempsit==3|tempsit==4)
*Adjust the NH & VT cases in light of the independents who are assumed to win and assumed to caucus with the dems.  This must be done after the votes to seats simulation for two seat contests (technically, two seat eseats2, rather).  
replace sd`aaa'=sd`aaa'+dheld2 if needest==1&exception==1&eseats2!=2&eseats2!=.
*The next line brings in all the cases where needest=0.  This has to be done before the votes to seats translation for two seat contests above.  
replace sd`aaa'=dheld2 if needest==0
*clean up
drop temp tempsit tempprob runi
}
forvalues ccc=1/4 {
drop opre`ccc' ocoef`ccc' thresh`ccc'
}
*The following tracks the proportion of times a seat will be won by a given number of Democrats.  
sum eseats
local ddd=r(max)
forvalues ccc=0/`ddd' {
egen numdemsis`ccc'=anycount(sd1-sd2000), values(`ccc')
replace numdemsis`ccc'=(numdemsis`ccc'/2000)*100
replace numdemsis`ccc'=. if eseats<`ccc'
}
compress
save 105simUOADistSeats$datezzz, replace

*SIMULATION-UOACHAM
*SEATSIM
*Adds the district-post seat simulations up to chamber level (one row per sid-sen-iteration) and flags, per iteration, which party holds control and the various super-majorities.  
clear
use 105simUOADistSeats$datezzz
collapse (sum) eseats sd1-sd2000, by(sid sen)
*reshape
reshape long sd, i(sid sen) j(col)
rename col iteration
rename sd dseats
gen dseatper=(dseats/eseats)*100
gen dcont=dseatper>50
gen rcont=dseatper<50
gen tcont=dseatper==50
*NOV 1, 2018 NOTE (original): the 3/5ths and 3/4ths requirements were coded with strict inequalities, i.e. as 3/5ths + 1 vote, although exactly 3/5ths is enough.  
*FIX: every fractional super-majority below is now inclusive (at least the fraction) and is tested on seat counts by cross-multiplication instead of on dseatper.  dseatper is stored as a float while (2/3)*100 and (1/3)*100 are evaluated in double precision, so at an exact two-thirds split the old comparison depended on rounding.  
*As before, the R flags treat every seat that is not counted in dseats as Republican.  
*veto thresholds
*veto35: states coded with a three-fifths override.  vetomaj: states coded with a simple-majority override.  All other states: two-thirds.  
gen byte veto35=inlist(sid,8,13,20,27,33,35,39)
gen byte vetomaj=inlist(sid,14,48,17,42,4,1)
gen dvetoproof=(3*dseats>=2*eseats) if veto35==0&vetomaj==0
gen rvetoproof=(3*dseats<=eseats) if veto35==0&vetomaj==0
replace dvetoproof=(5*dseats>=3*eseats) if veto35==1
replace rvetoproof=(5*dseats<=2*eseats) if veto35==1
*a simple majority still means strictly more than half
replace dvetoproof=dseatper>50 if vetomaj==1
replace rvetoproof=dseatper<50 if vetomaj==1
drop veto35 vetomaj
*super-majority budget thresholds
*default is plain chamber control.  sid 4: three-fourths.  sid 27 and 39: two-thirds.  
gen dbudg=dcont
gen rbudg=rcont
replace dbudg=(4*dseats>=3*eseats) if sid==4
replace rbudg=(4*dseats<=eseats) if sid==4
replace dbudg=(3*dseats>=2*eseats) if sid==27|sid==39
replace rbudg=(3*dseats<=eseats) if sid==27|sid==39
*super-majority tax increase thresholds
*default is plain chamber control, then two-thirds, three-fourths and three-fifths states in turn.  
gen dtax=dcont
gen rtax=rcont
replace dtax=(3*dseats>=2*eseats) if inlist(sid,3,5,18,25,28,41,49)
replace dtax=(4*dseats>=3*eseats) if inlist(sid,4,22,36)
replace dtax=(5*dseats>=3*eseats) if inlist(sid,8,17,24,37)
replace rtax=(3*dseats<=eseats) if inlist(sid,3,5,18,25,28,41,49)
replace rtax=(4*dseats<=eseats) if inlist(sid,4,22,36)
replace rtax=(5*dseats<=2*eseats) if inlist(sid,8,17,24,37)
compress
save 106simUOACham$datezzz, replace







*UOACHAM
*Collapses the per-iteration chamber results to one row per chamber: percentiles of the Democratic seat percentage, plus the share of iterations (as a percent) in which each control or super-majority flag is on.  
clear
use 106simUOACham$datezzz
collapse (p5) p5=dseatper (p33) p33=dseatper (p50) p50=dseatper (p67) p67=dseatper (p95) p95=dseatper (mean) dcont tcont rcont dvetoproof rvetoproof dbudg rbudg dtax rtax, by(sid sen)
*means of 0/1 flags are proportions, turn them into percents
foreach flag of varlist dcont rcont tcont dvetoproof rvetoproof dbudg rbudg dtax rtax {
replace `flag'=100*`flag'
}
*state codes: rows that exist only in the using file are not kept
merge m:1 sid using 009_StateCodes, keep(master match) nogenerate
merge 1:1 sid sen using 010jacobson20180502, keep(master match) nogenerate
*chamber label: S where sen is 1, H where sen is 0
gen cham=cond(sen==1,"S",cond(sen==0,"H",""))
*keychamber is 1 for the listed chambers and missing everywhere else, so the sort below puts those chambers first
gen keychamber=1 if inlist(sid,3,9,22,38,49)|(sen==1&inlist(sid,6,29))|(sen==0&inlist(sid,23,43))
order state cham dcont tcont rcont keychamber dvetoproof rvetoproof dbudg rbudg dtax rtax
sort keychamber state cham
compress
save 107ChamberForecasts$datezzz, replace
export delimited 107ChamberForecasts$datezzz.csv, replace

*Merge 2018 info with district level simulation results
*Step 1: strip the simulation file down to what is merged back in (the raw draws and the variables that also come from the main file are removed).  
clear
use 105simUOADistSeats$datezzz
drop sd1- sd2000 eseats dheld2 prefinal
save temp, replace

*Step 2: the 2018 rows of the main file are the master data
clear
use $mainfile2
keep if year==2018
keep sid sen dname dno geopost mmdpost nextup prefinal eseats dheld rheld oheld dcand rcand ocand dinc rinc oinc dother rother oother dpast rpast opast partuncont mixeduncont uncont dperlag candlag inclag otherlag pastlag dontuselag dtype nest nest1 nest2 nest3 bigthird bigthirdlag switchlag stealthlag
sort sid sen dname dno geopost mmdpost
*Step 3: attach the simulation results.  Master-only rows are discarded, and the assert then stops the run if a simulated row has no partner in the main file.  
merge 1:1 sid sen dname dno geopost mmdpost using temp, keep(match using)
assert _merge==3
drop _merge
erase temp.dta
*Step 4: state names.  Using-only rows are discarded and every remaining row must have matched.  
merge m:1 sid using 009_StateCodes, keep(master match)
assert _merge==3
drop _merge
*cham: S where sen is 1, H where sen is 0
gen cham=cond(sen==1,"S",cond(sen==0,"H",""))
sort sid sen dname dno geopost mmdpost
*dtype2 is the text version of the numeric district type, codes 1 through 9 in the order listed
gen dtype2="smd" if dtype==1
local code=1
foreach lbl in postmmd ffammd alternating alternatingpostmmd alternatingffammd smdfloterial postmmdfloterial ffammdfloterial {
local ++code
replace dtype2="`lbl'" if dtype==`code'
}
*These are proportions of the seats up: convert to a rounded seat count, then to text, with zero shown as blank
foreach v of varlist dcand rcand ocand dinc rinc oinc dother rother oother dpast rpast opast {
replace `v'=round(`v'*eseats)
tostring `v', replace
replace `v'="" if `v'=="0"
}
*The following were already in terms of seats, not proportion of seats up.  
foreach v of varlist dheld rheld oheld {
tostring `v', replace
replace `v'="" if `v'=="0"
}
*Build one summary string per attribute from the non-blank party counts (count followed by D, R or O, separated by blanks), then drop the three pieces
foreach grp in held cand inc other past {
gen `grp'b=d`grp'+"D" if d`grp'!=""
replace `grp'b=`grp'b+" "+r`grp'+"R" if r`grp'!=""
replace `grp'b=`grp'b+" "+o`grp'+"O" if o`grp'!=""
replace `grp'b=stritrim(strtrim(`grp'b))
drop d`grp' r`grp' o`grp'
}
*The following should be blank if the seat is not up in 2018
foreach grp in cand inc other past {
replace `grp'b="" if nextup!=2018
}
*descriptive names for the output file
rename heldb current_party
rename candb cands_this_election
rename incb incumbents_this_election
rename otherb other_chamber_cands_this_elect
rename pastb previous_leg_cand_this_elect

*assert dontuselag!=1
*list if dontuselag==1
*There are two exceptions to the assert above.  They were inspected at the end of the run and did not appear to cause any problems.  
*TODO: track the two dontuselag exceptions down and deal with them.  

*Bring in more candidate attributes
*NOTE(review): the original comment lists dcandname rcandname dtenure1 rtenure1 dtenure2 rtenure2 as the variables brought in here, while the later order command refers to names with an a or b suffix - confirm against tempcandattributes.  
merge 1:1 sid sen dname dno geopost mmdpost using tempcandattributes
erase tempcandattributes.dta
*Master-only rows (_merge==1) are expected and fine.  Using-only rows are not: the assert stops the run if there is one, so nothing is left to drop afterwards.  
assert _merge!=2
drop _merge

*CAUCUSESWITH
*Text description of district-posts whose member caucuses with the other party, normal for everything else.  Later replaces overwrite earlier ones, so the order of the statements is kept.  
*Grouping is written with parentheses and inlist() throughout instead of square brackets.  
gen caucuseswith=""
replace caucuseswith="R caucuses with D" if sid==2&sen==0&inlist(dno,31,32)
replace caucuseswith="Indep caucuses with D" if sid==4&sen==0&dno==11
replace caucuseswith="Indep caucuses with D" if sid==19&sen==0&inlist(dno,82,94,120)
replace caucuseswith="Lib caucuses with R" if sid==29&sen==0&((dname=="hillsborough"&dno==37)|(dname=="strafford"&dno==24))
*NOTE(review): the sid 45 part of the next condition does not restrict sen, unlike the other rules - confirm that is intended.  
replace caucuseswith="Indep caucuses with R" if (sid==39&sen==0&dno==36)|(sid==45&dname=="windhambenningtonwindsor")
replace caucuseswith="Indep caucuses with D" if sid==45&sen==0&(dname=="addisonrutland"|(dname=="franklin"&dno==2)|dname=="orangewashingtonaddison"|(dname=="washington"&dno==3)|dname=="windhambennington")
replace caucuseswith="R caucuses with D" if sid==32&((sen==1&dno==17)|(sen==0&inlist(dno,62,142)))
replace caucuseswith="normal" if caucuseswith==""
*cross-check against the weirdcaucus flag
tab caucuseswith weirdcaucus
*looks cool.  
*list sid sen dname dno geopost mmdpost caucuseswith weirdcaucus if caucuseswith!="normal"|weirdcaucus==1
*That checks out.  weirdcaucus and caucuseswith agree, and weirdcaucus agrees with my records.  
drop weirdcaucus

*Final layout of the district forecast file: sort, put the columns in presentation order, drop the working variables, then save as .dta and .csv.  
sort state cham dname dno geopost mmdpost
*NOTE(review): dcandnameb and rcandnameb appear twice in the list below, and stealthlag is ordered here but dropped a few lines further down.  The list is reproduced token for token - confirm what was intended.  
order ///
state cham dname dno geopost mmdpost ///
dcandnamea rcandnamea ///
prefinal ///
numdemsis0 numdemsis1 numdemsis2 numdemsis3 numdemsis4 numdemsis5 ///
numdemsis6 numdemsis7 numdemsis8 numdemsis9 numdemsis10 numdemsis11 ///
dtype2 nextup dperlag current_party caucuseswith ///
cands_this_election incumbents_this_election ///
other_chamber_cands_this_elect previous_leg_cand_this_elect ///
eseats ///
dcandnameb rcandnameb dtenure1a rtenure1a dtenure2a rtenure2a ///
candlag inclag otherlag pastlag stealthlag bigthirdlag ///
dcandnameb rcandnameb dtenure1b rtenure1b dtenure2b rtenure2b ///
nest nest1 nest2 nest3 ///
sab sid sfips sen

*DROP
*working variables that are not part of the published file
drop ///
partuncont mixeduncont uncont cand dtype ///
stealthlag bigthird switchlag dontuselag ///
eseats2 dem50 rheld2 exception irc needest

save 108DistrictForecasts$datezzz, replace
export delimited 108DistrictForecasts$datezzz.csv, replace





